From 13fc6703862862f4263d8d5d085b7a16b87190e1 Mon Sep 17 00:00:00 2001 From: Lorenz Meier Date: Sun, 28 Apr 2013 09:54:11 +0200 Subject: Moved last libs, drivers and headers, cleaned up IO build --- .../arm_biquad_cascade_df1_32x64_init_q31.c | 105 ++ .../arm_biquad_cascade_df1_32x64_q31.c | 553 ++++++++ .../arm_biquad_cascade_df1_f32.c | 421 ++++++ .../arm_biquad_cascade_df1_fast_q15.c | 283 ++++ .../arm_biquad_cascade_df1_fast_q31.c | 275 ++++ .../arm_biquad_cascade_df1_init_f32.c | 107 ++ .../arm_biquad_cascade_df1_init_q15.c | 109 ++ .../arm_biquad_cascade_df1_init_q31.c | 109 ++ .../arm_biquad_cascade_df1_q15.c | 408 ++++++ .../arm_biquad_cascade_df1_q31.c | 400 ++++++ .../arm_biquad_cascade_df2T_f32.c | 377 +++++ .../arm_biquad_cascade_df2T_init_f32.c | 97 ++ .../Source/FilteringFunctions/arm_conv_f32.c | 646 +++++++++ .../FilteringFunctions/arm_conv_fast_opt_q15.c | 538 +++++++ .../Source/FilteringFunctions/arm_conv_fast_q15.c | 1405 +++++++++++++++++++ .../Source/FilteringFunctions/arm_conv_fast_q31.c | 572 ++++++++ .../Source/FilteringFunctions/arm_conv_opt_q15.c | 544 ++++++++ .../Source/FilteringFunctions/arm_conv_opt_q7.c | 434 ++++++ .../FilteringFunctions/arm_conv_partial_f32.c | 661 +++++++++ .../arm_conv_partial_fast_opt_q15.c | 763 ++++++++++ .../FilteringFunctions/arm_conv_partial_fast_q15.c | 1473 ++++++++++++++++++++ .../FilteringFunctions/arm_conv_partial_fast_q31.c | 599 ++++++++ .../FilteringFunctions/arm_conv_partial_opt_q15.c | 764 ++++++++++ .../FilteringFunctions/arm_conv_partial_opt_q7.c | 806 +++++++++++ .../FilteringFunctions/arm_conv_partial_q15.c | 778 +++++++++++ .../FilteringFunctions/arm_conv_partial_q31.c | 599 ++++++++ .../FilteringFunctions/arm_conv_partial_q7.c | 733 ++++++++++ .../Source/FilteringFunctions/arm_conv_q15.c | 733 ++++++++++ .../Source/FilteringFunctions/arm_conv_q31.c | 564 ++++++++ .../Source/FilteringFunctions/arm_conv_q7.c | 689 +++++++++ .../Source/FilteringFunctions/arm_correlate_f32.c | 738 ++++++++++ .../arm_correlate_fast_opt_q15.c | 507 +++++++ .../FilteringFunctions/arm_correlate_fast_q15.c | 1314 +++++++++++++++++ .../FilteringFunctions/arm_correlate_fast_q31.c | 607 ++++++++ .../FilteringFunctions/arm_correlate_opt_q15.c | 512 +++++++ .../FilteringFunctions/arm_correlate_opt_q7.c | 463 ++++++ .../Source/FilteringFunctions/arm_correlate_q15.c | 718 ++++++++++ .../Source/FilteringFunctions/arm_correlate_q31.c | 664 +++++++++ .../Source/FilteringFunctions/arm_correlate_q7.c | 789 +++++++++++ .../FilteringFunctions/arm_fir_decimate_f32.c | 518 +++++++ .../FilteringFunctions/arm_fir_decimate_fast_q15.c | 590 ++++++++ .../FilteringFunctions/arm_fir_decimate_fast_q31.c | 343 +++++ .../FilteringFunctions/arm_fir_decimate_init_f32.c | 112 ++ .../FilteringFunctions/arm_fir_decimate_init_q15.c | 114 ++ .../FilteringFunctions/arm_fir_decimate_init_q31.c | 112 ++ .../FilteringFunctions/arm_fir_decimate_q15.c | 691 +++++++++ .../FilteringFunctions/arm_fir_decimate_q31.c | 306 ++++ .../Source/FilteringFunctions/arm_fir_f32.c | 554 ++++++++ .../Source/FilteringFunctions/arm_fir_fast_q15.c | 341 +++++ .../Source/FilteringFunctions/arm_fir_fast_q31.c | 309 ++++ .../Source/FilteringFunctions/arm_fir_init_f32.c | 94 ++ .../Source/FilteringFunctions/arm_fir_init_q15.c | 152 ++ .../Source/FilteringFunctions/arm_fir_init_q31.c | 94 ++ .../Source/FilteringFunctions/arm_fir_init_q7.c | 92 ++ .../FilteringFunctions/arm_fir_interpolate_f32.c | 574 ++++++++ .../arm_fir_interpolate_init_f32.c | 116 ++ .../arm_fir_interpolate_init_q15.c | 115 ++ .../arm_fir_interpolate_init_q31.c | 116 ++ .../FilteringFunctions/arm_fir_interpolate_q15.c | 503 +++++++ .../FilteringFunctions/arm_fir_interpolate_q31.c | 499 +++++++ .../FilteringFunctions/arm_fir_lattice_f32.c | 499 +++++++ .../FilteringFunctions/arm_fir_lattice_init_f32.c | 78 ++ .../FilteringFunctions/arm_fir_lattice_init_q15.c | 78 ++ .../FilteringFunctions/arm_fir_lattice_init_q31.c | 78 ++ .../FilteringFunctions/arm_fir_lattice_q15.c | 531 +++++++ .../FilteringFunctions/arm_fir_lattice_q31.c | 348 +++++ .../Source/FilteringFunctions/arm_fir_q15.c | 689 +++++++++ .../Source/FilteringFunctions/arm_fir_q31.c | 363 +++++ .../DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c | 388 ++++++ .../Source/FilteringFunctions/arm_fir_sparse_f32.c | 365 +++++ .../FilteringFunctions/arm_fir_sparse_init_f32.c | 102 ++ .../FilteringFunctions/arm_fir_sparse_init_q15.c | 102 ++ .../FilteringFunctions/arm_fir_sparse_init_q31.c | 101 ++ .../FilteringFunctions/arm_fir_sparse_init_q7.c | 102 ++ .../Source/FilteringFunctions/arm_fir_sparse_q15.c | 406 ++++++ .../Source/FilteringFunctions/arm_fir_sparse_q31.c | 370 +++++ .../Source/FilteringFunctions/arm_fir_sparse_q7.c | 398 ++++++ .../FilteringFunctions/arm_iir_lattice_f32.c | 440 ++++++ .../FilteringFunctions/arm_iir_lattice_init_f32.c | 86 ++ .../FilteringFunctions/arm_iir_lattice_init_q15.c | 86 ++ .../FilteringFunctions/arm_iir_lattice_init_q31.c | 86 ++ .../FilteringFunctions/arm_iir_lattice_q15.c | 457 ++++++ .../FilteringFunctions/arm_iir_lattice_q31.c | 345 +++++ .../Source/FilteringFunctions/arm_lms_f32.c | 434 ++++++ .../Source/FilteringFunctions/arm_lms_init_f32.c | 90 ++ .../Source/FilteringFunctions/arm_lms_init_q15.c | 100 ++ .../Source/FilteringFunctions/arm_lms_init_q31.c | 100 ++ .../Source/FilteringFunctions/arm_lms_norm_f32.c | 456 ++++++ .../FilteringFunctions/arm_lms_norm_init_f32.c | 100 ++ .../FilteringFunctions/arm_lms_norm_init_q15.c | 107 ++ .../FilteringFunctions/arm_lms_norm_init_q31.c | 106 ++ .../Source/FilteringFunctions/arm_lms_norm_q15.c | 435 ++++++ .../Source/FilteringFunctions/arm_lms_norm_q31.c | 426 ++++++ .../Source/FilteringFunctions/arm_lms_q15.c | 374 +++++ .../Source/FilteringFunctions/arm_lms_q31.c | 364 +++++ 95 files changed, 38792 insertions(+) create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c create mode 100644 src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c (limited to 'src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions') diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c new file mode 100644 index 000000000..a6745c0cd --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c @@ -0,0 +1,105 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_32x64_init_q31.c +* +* Description: High precision Q31 Biquad cascade filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1_32x64 + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the high precision Q31 Biquad cascade filter structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format. + * @return none + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
    
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
+ * 
+ * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState points to state variables array and size of each state variable is 1.63 format. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the state array as: + *
    
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cas_df1_32x64_init_q31( + arm_biquad_cas_df1_32x64_ins_q31 * S, + uint8_t numStages, + q31_t * pCoeffs, + q63_t * pState, + uint8_t postShift) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign postShift to be applied to the output */ + S->postShift = postShift; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(q63_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1_32x64 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c new file mode 100644 index 000000000..82d6164ee --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c @@ -0,0 +1,553 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_32x64_q31.c +* +* Description: High precision Q31 Biquad cascade filter processing function +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter + * + * This function implements a high precision Biquad cascade filter which operates on + * Q31 data values. The filter coefficients are in 1.31 format and the state variables + * are in 1.63 format. The double precision state variables reduce quantization noise + * in the filter and provide a cleaner output. + * These filters are particularly useful when implementing filters in which the + * singularities are close to the unit circle. This is common for low pass or high + * pass filters with very low cutoff frequencies. + * + * The function operates on blocks of input and output data + * and each call to the function processes blockSize samples through + * the filter. pSrc and pDst points to input and output arrays + * containing blockSize Q31 values. + * + * \par Algorithm + * Each Biquad stage implements a second order filter using the difference equation: + *
    
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]    
+ * 
+ * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage. + * \image html Biquad.gif "Single Biquad filter stage" + * Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. + * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. + * Pay careful attention to the sign of the feedback coefficients. + * Some design tools use the difference equation + *
    
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]    
+ * 
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. + * + * \par + * Higher order filters are realized as a cascade of second order sections. + * numStages refers to the number of second order stages used. + * For example, an 8th order filter would be realized with numStages=4 second order stages. + * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages" + * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). + * + * \par + * The pState points to state variables array . + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2] and each state variable in 1.63 format to improve precision. + * The state variables are arranged in the array as: + *
    
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
+ * 
+ * + * \par + * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values of data in 1.63 format. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * + * \par Init Function + * There is also an associated initialization function which performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * For example, to statically initialize the filter instance structure use + *
    
+ *     arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};    
+ * 
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer; postShift shift to be applied which is described in detail below. + * \par Fixed-Point Behavior + * Care must be taken while using Biquad Cascade 32x64 filter function. + * Following issues must be considered: + * - Scaling of coefficients + * - Filter gain + * - Overflow and saturation + * + * \par + * Filter coefficients are represented as fractional values and + * restricted to lie in the range [-1 +1). + * The processing function has an additional scaling parameter postShift + * which allows the filter coefficients to exceed the range [+1 -1). + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator" + * This essentially scales the filter coefficients by 2^postShift. + * For example, to realize the coefficients + *
    
+ *    {1.5, -0.8, 1.2, 1.6, -0.9}    
+ * 
+ * set the Coefficient array to: + *
    
+ *    {0.75, -0.4, 0.6, 0.8, -0.45}    
+ * 
+ * and set postShift=1 + * + * \par + * The second thing to keep in mind is the gain through the filter. + * The frequency response of a Biquad filter is a function of its coefficients. + * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies. + * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter. + * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed. + * + * \par + * The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version. + * This is described in the function specific documentation below. + */ + +/** + * @addtogroup BiquadCascadeDF1_32x64 + * @{ + */ + +/** + * @details + + * @param[in] *S points to an instance of the high precision Q31 Biquad cascade filter. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25). + * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to + * 1.31 format by discarding the low 32 bits. + * + * \par + * Two related functions are provided in the CMSIS DSP library. + * arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator. + * arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator. + */ + +void arm_biquad_cas_df1_32x64_q31( + const arm_biquad_cas_df1_32x64_ins_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pIn = pSrc; /* input pointer initialization */ + q31_t *pOut = pDst; /* output pointer initialization */ + q63_t *pState = S->pState; /* state pointer initialization */ + q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */ + q63_t acc; /* accumulator */ + q31_t Xn1, Xn2; /* Input Filter state variables */ + q63_t Yn1, Yn2; /* Output Filter state variables */ + q31_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q31_t Xn; /* temporary input */ + int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */ + uint32_t sample, stage = S->numStages; /* loop counters */ + q31_t acc_l, acc_h; /* temporary output */ + uint32_t uShift = ((uint32_t) S->postShift + 1u); + uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = (q31_t) (pState[0]); + Xn2 = (q31_t) (pState[1]); + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variable acc hold output value that is being computed and + * stored in the destination buffer + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* The result is converted to 1.63 , Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *pOut = acc_h; + + /* Read the second input into Xn2, to reuse the value */ + Xn2 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc += b1 * x[n-1] */ + acc = (q63_t) Xn *b1; + + /* acc = b0 * x[n] */ + acc += (q63_t) Xn2 *b0; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn1 *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn2, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn1, a2); + + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Read the third input into Xn1, to reuse the value */ + Xn1 = *pIn++; + + /* The result is converted to 1.31 */ + /* Store the output in the destination buffer. */ + *(pOut + 1u) = acc_h; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) Xn1 *b0; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn2 *b1; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* The result is converted to 1.63, Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *(pOut + 2u) = acc_h; + + /* Read the fourth input into Xn, to reuse the value */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn2, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn1, a2); + + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *(pOut + 3u) = acc_h; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + + /* update output pointer */ + pOut += 4u; + + /* decrement the loop counter */ + sample--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = (blockSize & 0x3u); + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *pOut++ = acc_h; + //Yn1 = acc << shift; + + /* Store the output in the destination buffer in 1.31 format. */ +// *pOut++ = (q31_t) (acc >> (32 - shift)); + + /* decrement the loop counter */ + sample--; + } + + /* The first stage output is given as input to the second stage. */ + pIn = pDst; + + /* Reset to destination buffer working pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + /* Store the updated state variables back into the pState array */ + *pState++ = (q63_t) Xn1; + *pState++ = (q63_t) Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while(--stage); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variable acc hold output value that is being computed and + * stored in the destination buffer + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *pOut++ = acc_h; + + //Yn1 = acc << shift; + + /* Store the output in the destination buffer in 1.31 format. */ + //*pOut++ = (q31_t) (acc >> (32 - shift)); + + /* decrement the loop counter */ + sample--; + } + + /* The first stage output is given as input to the second stage. */ + pIn = pDst; + + /* Reset to destination buffer working pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = (q63_t) Xn1; + *pState++ = (q63_t) Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while(--stage); + +#endif /* #ifndef ARM_MATH_CM0 */ +} + + /** + * @} end of BiquadCascadeDF1_32x64 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c new file mode 100644 index 000000000..ee20dfeec --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c @@ -0,0 +1,421 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_f32.c +* +* Description: Processing function for the +* floating-point Biquad cascade DirectFormI(DF1) filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure + * + * This set of functions implements arbitrary order recursive (IIR) filters. + * The filters are implemented as a cascade of second order Biquad sections. + * The functions support Q15, Q31 and floating-point data types. + * Fast version of Q15 and Q31 also supported on CortexM4 and Cortex-M3. + * + * The functions operate on blocks of input and output data and each call to the function + * processes blockSize samples through the filter. + * pSrc points to the array of input data and + * pDst points to the array of output data. + * Both arrays contain blockSize values. + * + * \par Algorithm + * Each Biquad stage implements a second order filter using the difference equation: + *
    
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]    
+ * 
+ * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage. + * \image html Biquad.gif "Single Biquad filter stage" + * Coefficients b0, b1 and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. + * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. + * Pay careful attention to the sign of the feedback coefficients. + * Some design tools use the difference equation + *
    
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]    
+ * 
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. + * + * \par + * Higher order filters are realized as a cascade of second order sections. + * numStages refers to the number of second order stages used. + * For example, an 8th order filter would be realized with numStages=4 second order stages. + * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages" + * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). + * + * \par + * The pState points to state variables array. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
    
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
+ * 
+ * + * \par + * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed, the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Init Functions + * There is also an associated initialization function for each data type. + * The initialization function performs following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 3 different data type filter instance structures + *
    
+ *     arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};    
+ *     arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};    
+ *     arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};    
+ * 
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer; postShift shift to be applied. + * + * \par Fixed-Point Behavior + * Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions. + * Following issues must be considered: + * - Scaling of coefficients + * - Filter gain + * - Overflow and saturation + * + * \par + * Scaling of coefficients: + * Filter coefficients are represented as fractional values and + * coefficients are restricted to lie in the range [-1 +1). + * The fixed-point functions have an additional scaling parameter postShift + * which allow the filter coefficients to exceed the range [+1 -1). + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator" + * This essentially scales the filter coefficients by 2^postShift. + * For example, to realize the coefficients + *
    
+ *    {1.5, -0.8, 1.2, 1.6, -0.9}    
+ * 
+ * set the pCoeffs array to: + *
    
+ *    {0.75, -0.4, 0.6, 0.8, -0.45}    
+ * 
+ * and set postShift=1 + * + * \par + * Filter gain: + * The frequency response of a Biquad filter is a function of its coefficients. + * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies. + * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter. + * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed. + * + * \par + * Overflow and saturation: + * For Q15 and Q31 versions, it is described separately as part of the function specific documentation below. + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @param[in] *S points to an instance of the floating-point Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + */ + +void arm_biquad_cascade_df1_f32( + const arm_biquad_casd_df1_inst_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pIn = pSrc; /* source pointer */ + float32_t *pOut = pDst; /* destination pointer */ + float32_t *pState = S->pState; /* pState pointer */ + float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */ + float32_t acc; /* Simulates the accumulator */ + float32_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */ + float32_t Xn; /* temporary input */ + uint32_t sample, stage = S->numStages; /* loop counters */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the pState values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(sample > 0u) + { + /* Read the first input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn2; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + + /* Read the second input */ + Xn2 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn1; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + + /* Read the third input */ + Xn1 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn2; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + + /* Read the forth input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn1; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + + /* decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = blockSize & 0x3u; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = acc; + + /* decrement the loop counter */ + sample--; + + } + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent numStages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while(stage > 0u); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the pState values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variables acc holds the output value that is computed: + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = acc; + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent numStages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while(stage > 0u); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + + + /** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c new file mode 100644 index 000000000..29afffa03 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c @@ -0,0 +1,283 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_fast_q15.c +* +* Description: Fast processing function for the +* Q15 Biquad cascade filter. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.9 2010/08/16 +* Initial version +* +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * @param[in] *S points to an instance of the Q15 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around and distorts the result. + * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). + * The 2.30 accumulator is then shifted by postShift bits and the result truncated to 1.15 format by discarding the low 16 bits. + * + * \par + * Refer to the function arm_biquad_cascade_df1_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure. + * Use the function arm_biquad_cascade_df1_init_q15() to initialize the filter structure. + * + */ + +void arm_biquad_cascade_df1_fast_q15( + const arm_biquad_casd_df1_inst_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pIn = pSrc; /* Source pointer */ + q15_t *pOut = pDst; /* Destination pointer */ + q31_t in; /* Temporary variable to hold input value */ + q31_t out; /* Temporary variable to hold output value */ + q31_t b0; /* Temporary variable to hold bo value */ + q31_t b1, a1; /* Filter coefficients */ + q31_t state_in, state_out; /* Filter state variables */ + q31_t acc; /* Accumulator */ + int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + uint32_t sample, stage = S->numStages; /* Stage loop counter */ + + + + do + { + + /* Read the b0 and 0 coefficients using SIMD */ + b0 = *__SIMD32(pCoeffs)++; + + /* Read the b1 and b2 coefficients using SIMD */ + b1 = *__SIMD32(pCoeffs)++; + + /* Read the a1 and a2 coefficients using SIMD */ + a1 = *__SIMD32(pCoeffs)++; + + /* Read the input state values from the state buffer: x[n-1], x[n-2] */ + state_in = *__SIMD32(pState)++; + + /* Read the output state values from the state buffer: y[n-1], y[n-2] */ + state_out = *__SIMD32(pState)--; + + /* Apply loop unrolling and compute 2 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + sample = blockSize >> 1u; + + /* First part of the processing with loop unrolling. Compute 2 outputs at a time. + ** a second loop below computes the remaining 1 sample. */ + while(sample > 0u) + { + + /* Read the input */ + in = *__SIMD32(pIn)++; + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUAD(b0, in); + /* acc = b1 * x[n-1] + acc += b2 * x[n-2] + out */ + acc = __SMLAD(b1, state_in, out); + /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */ + acc = __SMLAD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 and then saturation is applied */ + out = __SSAT((acc >> shift), 16); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, (in >> 16), 16); + state_out = __PKHBT(state_out >> 16, (out), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUADX(b0, in); + /* acc0 = b1 * x[n-1] , acc0 += b2 * x[n-2] + out */ + acc = __SMLAD(b1, state_in, out); + /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */ + acc = __SMLAD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 and then saturation is applied */ + out = __SSAT((acc >> shift), 16); + + + /* Store the output in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in >> 16, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + + if((blockSize & 0x1u) != 0u) + { + /* Read the input */ + in = *pIn++; + + /* out = b0 * x[n] + 0 * 0 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + out = __SMUAD(b0, in); + +#else + + out = __SMUADX(b0, in); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc = b1 * x[n-1], acc += b2 * x[n-2] + out */ + acc = __SMLAD(b1, state_in, out); + /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */ + acc = __SMLAD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 and then saturation is applied */ + out = __SSAT((acc >> shift), 16); + + /* Store the output in the destination buffer. */ + *pOut++ = (q15_t) out; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent (numStages - 1) occur in-place in the output buffer */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* Store the updated state variables back into the state array */ + *__SIMD32(pState)++ = state_in; + *__SIMD32(pState)++ = state_out; + + + /* Decrement the loop counter */ + stage--; + + } while(stage > 0u); +} + + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c new file mode 100644 index 000000000..0a479fe6b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c @@ -0,0 +1,275 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_fast_q31.c +* +* Description: Processing function for the +* Q31 Fast Biquad cascade DirectFormI(DF1) filter. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.9 2010/08/27 +* Initial version +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * + * @param[in] *S points to an instance of the Q31 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are added to a 2.30 accumulator. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function + * arm_biquad_cascade_df1_init_q31() to initialize filter structure. + * + * \par + * Refer to the function arm_biquad_cascade_df1_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure. + * Use the function arm_biquad_cascade_df1_init_q31() to initialize the filter structure. + */ + +void arm_biquad_cascade_df1_fast_q31( + const arm_biquad_casd_df1_inst_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t acc; /* accumulator */ + q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */ + q31_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q31_t *pIn = pSrc; /* input pointer initialization */ + q31_t *pOut = pDst; /* output pointer initialization */ + q31_t *pState = S->pState; /* pState pointer initialization */ + q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */ + q31_t Xn; /* temporary input */ + int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */ + uint32_t sample, stage = S->numStages; /* loop counters */ + + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variables acc ... acc3 hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) (((q63_t) b1 * Xn1) >> 32); + /* acc += b1 * x[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32); + /* acc += b[2] * x[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32); + /* acc += a1 * y[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32); + /* acc += a2 * y[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32); + + /* The result is converted to 1.31 , Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Read the second input */ + Xn2 = *(pIn + 1u); + + /* Store the output in the destination buffer. */ + *pOut = Yn2; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32); + /* acc += b1 * x[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32); + /* acc += b[2] * x[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32); + /* acc += a1 * y[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32); + /* acc += a2 * y[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32); + + /* The result is converted to 1.31, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Read the third input */ + Xn1 = *(pIn + 2u); + + /* Store the output in the destination buffer. */ + *(pOut + 1u) = Yn1; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32); + /* acc += b1 * x[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32); + /* acc += b[2] * x[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32); + /* acc += a1 * y[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32); + /* acc += a2 * y[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32); + + /* The result is converted to 1.31, Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Read the forth input */ + Xn = *(pIn + 3u); + + /* Store the output in the destination buffer. */ + *(pOut + 2u) = Yn2; + pIn += 4u; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32); + /* acc += b1 * x[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32); + /* acc += b[2] * x[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32); + /* acc += a1 * y[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32); + /* acc += a2 * y[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + Xn2 = Xn1; + + /* The result is converted to 1.31, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Xn1 = Xn */ + Xn1 = Xn; + + /* Store the output in the destination buffer. */ + *(pOut + 3u) = Yn1; + pOut += 4u; + + /* decrement the loop counter */ + sample--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = (blockSize & 0x3u); + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32); + /* acc += b1 * x[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32); + /* acc += b[2] * x[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32); + /* acc += a1 * y[n-1] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32); + /* acc += a2 * y[n-2] */ + acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32); + /* The result is converted to 1.31 */ + acc = acc << shift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = acc; + + /* Store the output in the destination buffer. */ + *pOut++ = acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while(--stage); +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c new file mode 100644 index 000000000..d50b69f3c --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c @@ -0,0 +1,107 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_init_f32.c +* +* Description: floating-point Biquad cascade DirectFormI(DF1) filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * @brief Initialization function for the floating-point Biquad cascade filter. + * @param[in,out] *S points to an instance of the floating-point Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients array. + * @param[in] *pState points to the state array. + * @return none + * + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
    
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
+ * 
+ * + * \par + * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState is a pointer to state array. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
    
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * + */ + +void arm_biquad_cascade_df1_init_f32( + arm_biquad_casd_df1_inst_f32 * S, + uint8_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c new file mode 100644 index 000000000..d5fda28ac --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c @@ -0,0 +1,109 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_init_q15.c +* +* Description: Q15 Biquad cascade DirectFormI(DF1) filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q15 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format + * @return none + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
    
+ *     {b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}    
+ * 
+ * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 6*numStages values. + * The zero coefficient between b1 and b2 facilities use of 16-bit SIMD instructions on the Cortex-M4. + * + * \par + * The state variables are stored in the array pState. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
    
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df1_init_q15( + arm_biquad_casd_df1_inst_q15 * S, + uint8_t numStages, + q15_t * pCoeffs, + q15_t * pState, + int8_t postShift) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign postShift to be applied to the output */ + S->postShift = postShift; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c new file mode 100644 index 000000000..dbbb8aa2b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c @@ -0,0 +1,109 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_init_q31.c +* +* Description: Q31 Biquad cascade DirectFormI(DF1) filter initialization function. +* +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q31 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format + * @return none + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
    
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
+ * 
+ * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState points to state variables array. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
    
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df1_init_q31( + arm_biquad_casd_df1_inst_q31 * S, + uint8_t numStages, + q31_t * pCoeffs, + q31_t * pState, + int8_t postShift) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign postShift to be applied to the output */ + S->postShift = postShift; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c new file mode 100644 index 000000000..484cd85e8 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c @@ -0,0 +1,408 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_q15.c +* +* Description: Processing function for the +* Q15 Biquad cascade DirectFormI(DF1) filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @brief Processing function for the Q15 Biquad cascade filter. + * @param[in] *S points to an instance of the Q15 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is then shifted by postShift bits to truncate the result to 1.15 format by discarding the low 16 bits. + * Finally, the result is saturated to 1.15 format. + * + * \par + * Refer to the function arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_biquad_cascade_df1_q15( + const arm_biquad_casd_df1_inst_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn = pSrc; /* Source pointer */ + q15_t *pOut = pDst; /* Destination pointer */ + q31_t in; /* Temporary variable to hold input value */ + q31_t out; /* Temporary variable to hold output value */ + q31_t b0; /* Temporary variable to hold bo value */ + q31_t b1, a1; /* Filter coefficients */ + q31_t state_in, state_out; /* Filter state variables */ + q31_t acc_l, acc_h; + q63_t acc; /* Accumulator */ + int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */ + int32_t uShift = (32 - lShift); + + do + { + /* Read the b0 and 0 coefficients using SIMD */ + b0 = *__SIMD32(pCoeffs)++; + + /* Read the b1 and b2 coefficients using SIMD */ + b1 = *__SIMD32(pCoeffs)++; + + /* Read the a1 and a2 coefficients using SIMD */ + a1 = *__SIMD32(pCoeffs)++; + + /* Read the input state values from the state buffer: x[n-1], x[n-2] */ + state_in = *__SIMD32(pState)++; + + /* Read the output state values from the state buffer: y[n-1], y[n-2] */ + state_out = *__SIMD32(pState)--; + + /* Apply loop unrolling and compute 2 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + sample = blockSize >> 1u; + + /* First part of the processing with loop unrolling. Compute 2 outputs at a time. + ** a second loop below computes the remaining 1 sample. */ + while(sample > 0u) + { + + /* Read the input */ + in = *__SIMD32(pIn)++; + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUAD(b0, in); + + /* acc += b1 * x[n-1] + b2 * x[n-2] + out */ + acc = __SMLALD(b1, state_in, out); + /* acc += a1 * y[n-1] + a2 * y[n-2] */ + acc = __SMLALD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + out = (uint32_t) acc_l >> lShift | acc_h << uShift; + + out = __SSAT(out, 16); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, (in >> 16), 16); + state_out = __PKHBT(state_out >> 16, (out), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUADX(b0, in); + /* acc += b1 * x[n-1] + b2 * x[n-2] + out */ + acc = __SMLALD(b1, state_in, out); + /* acc += a1 * y[n-1] + a2 * y[n-2] */ + acc = __SMLALD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + out = (uint32_t) acc_l >> lShift | acc_h << uShift; + + out = __SSAT(out, 16); + + /* Store the output in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in >> 16, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + + if((blockSize & 0x1u) != 0u) + { + /* Read the input */ + in = *pIn++; + + /* out = b0 * x[n] + 0 * 0 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + out = __SMUAD(b0, in); + +#else + + out = __SMUADX(b0, in); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc = b1 * x[n-1] + b2 * x[n-2] + out */ + acc = __SMLALD(b1, state_in, out); + /* acc += a1 * y[n-1] + a2 * y[n-2] */ + acc = __SMLALD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + out = (uint32_t) acc_l >> lShift | acc_h << uShift; + + out = __SSAT(out, 16); + + /* Store the output in the destination buffer. */ + *pOut++ = (q15_t) out; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + } + + /* The first stage goes from the input wire to the output wire. */ + /* Subsequent numStages occur in-place in the output wire */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* Store the updated state variables back into the state array */ + *__SIMD32(pState)++ = state_in; + *__SIMD32(pState)++ = state_out; + + + /* Decrement the loop counter */ + stage--; + + } while(stage > 0u); + +#else + + /* Run the below code for Cortex-M0 */ + + q15_t *pIn = pSrc; /* Source pointer */ + q15_t *pOut = pDst; /* Destination pointer */ + q15_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */ + q15_t Xn; /* temporary input */ + q63_t acc; /* Accumulator */ + int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variables acc holds the output value that is computed: + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) b0 *Xn; + + /* acc += b1 * x[n-1] */ + acc += (q31_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q31_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q31_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q31_t) a2 *Yn2; + + /* The result is converted to 1.31 */ + acc = __SSAT((acc >> shift), 16); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = (q15_t) acc; + + /* Store the output in the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while(--stage); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c new file mode 100644 index 000000000..5626bdd1c --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c @@ -0,0 +1,400 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df1_q31.c +* +* Description: Processing function for the +* Q31 Biquad cascade filter +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @brief Processing function for the Q31 Biquad cascade filter. + * @param[in] *S points to an instance of the Q31 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25). + * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to + * 1.31 format by discarding the low 32 bits. + * + * \par + * Refer to the function arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_biquad_cascade_df1_q31( + const arm_biquad_casd_df1_inst_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q63_t acc; /* accumulator */ + uint32_t uShift = ((uint32_t) S->postShift + 1u); + uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */ + q31_t *pIn = pSrc; /* input pointer initialization */ + q31_t *pOut = pDst; /* output pointer initialization */ + q31_t *pState = S->pState; /* pState pointer initialization */ + q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */ + q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */ + q31_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q31_t Xn; /* temporary input */ + uint32_t sample, stage = S->numStages; /* loop counters */ + + +#ifndef ARM_MATH_CM0 + + q31_t acc_l, acc_h; /* temporary output variables */ + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31 , Yn2 variable is reused */ + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn2; + + /* Read the second input */ + Xn2 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn2; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn1; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn2; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn1; + + + /* The result is converted to 1.31, Yn1 variable is reused */ + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + + /* Apply shift for lower part of acc and upper part of acc */ + Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn1; + + /* Read the third input */ + Xn1 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn1; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn2; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31, Yn2 variable is reused */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + + /* Apply shift for lower part of acc and upper part of acc */ + Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn2; + + /* Read the forth input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn2; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn1; + + /* The result is converted to 1.31, Yn1 variable is reused */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn1; + + /* decrement the loop counter */ + sample--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = (blockSize & 0x3u); + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31 */ + acc = acc >> lShift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = (q31_t) acc; + + /* Store the output in the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while(--stage); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variables acc holds the output value that is computed: + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31 */ + acc = acc >> lShift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = (q31_t) acc; + + /* Store the output in the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while(--stage); + +#endif /* #ifndef ARM_MATH_CM0 */ +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c new file mode 100644 index 000000000..a8cb0c98c --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c @@ -0,0 +1,377 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df2T_f32.c +* +* Description: Processing function for the floating-point transposed +* direct form II Biquad cascade filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure + * + * This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure. + * The filters are implemented as a cascade of second order Biquad sections. + * These functions provide a slight memory savings as compared to the direct form I Biquad filter functions. + * Only floating-point data is supported. + * + * This function operate on blocks of input and output data and each call to the function + * processes blockSize samples through the filter. + * pSrc points to the array of input data and + * pDst points to the array of output data. + * Both arrays contain blockSize values. + * + * \par Algorithm + * Each Biquad stage implements a second order filter using the difference equation: + *
       
+ *    y[n] = b0 * x[n] + d1       
+ *    d1 = b1 * x[n] + a1 * y[n] + d2       
+ *    d2 = b2 * x[n] + a2 * y[n]       
+ * 
+ * where d1 and d2 represent the two state values. + * + * \par + * A Biquad filter using a transposed Direct Form II structure is shown below. + * \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" + * Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. + * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. + * Pay careful attention to the sign of the feedback coefficients. + * Some design tools flip the sign of the feedback coefficients: + *
       
+ *    y[n] = b0 * x[n] + d1;       
+ *    d1 = b1 * x[n] - a1 * y[n] + d2;       
+ *    d2 = b2 * x[n] - a2 * y[n];       
+ * 
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. + * + * \par + * Higher order filters are realized as a cascade of second order sections. + * numStages refers to the number of second order stages used. + * For example, an 8th order filter would be realized with numStages=4 second order stages. + * A 9th order filter would be realized with numStages=5 second order stages with the + * coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). + * + * \par + * pState points to the state variable array. + * Each Biquad stage has 2 state variables d1 and d2. + * The state variables are arranged in the pState array as: + *
       
+ *     {d11, d12, d21, d22, ...}       
+ * 
+ * where d1x refers to the state variables for the first Biquad and + * d2x refers to the state variables for the second Biquad. + * The state array has a total length of 2*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * + * \par + * The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II. + * The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types. + * That is why the Direct Form I structure supports Q15 and Q31 data types. + * The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2. + * Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad. + * The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * + * \par Init Functions + * There is also an associated initialization function. + * The initialization function performs following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * For example, to statically initialize the instance structure use + *
       
+ *     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};       
+ * 
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer. + * pCoeffs is the address of the coefficient buffer; + * + */ + +/** + * @addtogroup BiquadCascadeDF2T + * @{ + */ + +/** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in] *S points to an instance of the filter data structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_biquad_cascade_df2T_f32( + const arm_biquad_cascade_df2T_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + + float32_t *pIn = pSrc; /* source pointer */ + float32_t *pOut = pDst; /* destination pointer */ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */ + float32_t acc0; /* accumulator */ + float32_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float32_t Xn; /* temporary input */ + float32_t d1, d2; /* state variables */ + uint32_t sample, stage = S->numStages; /* loop counters */ + +#ifndef ARM_MATH_CM0 + + float32_t Xn1, Xn2; /* Input State variables */ + float32_t acc1; /* accumulator */ + + + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + sample = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(sample > 0u) + { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the first input */ + Xn1 = *pIn++; + + /* y[n] = b0 * x[n] + d1 */ + acc0 = (b0 * Xn1) + d1; + + /* d1 = b1 * x[n] + d2 */ + d1 = (b1 * Xn1) + d2; + + /* d2 = b2 * x[n] */ + d2 = (b2 * Xn1); + + /* Read the second input */ + Xn2 = *pIn++; + + /* d1 = b1 * x[n] + a1 * y[n] */ + d1 = (a1 * acc0) + d1; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + + d2 = (a2 * acc0) + d2; + + /* y[n] = b0 * x[n] + d1 */ + acc1 = (b0 * Xn2) + d1; + + /* Read the third input */ + Xn1 = *pIn++; + + d1 = (b1 * Xn2) + d2; + + d2 = (b2 * Xn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc1; + + d1 = (a1 * acc1) + d1; + + d2 = (a2 * acc1) + d2; + + /* y[n] = b0 * x[n] + d1 */ + acc0 = (b0 * Xn1) + d1; + + d1 = (b1 * Xn1) + d2; + + d2 = (b2 * Xn1); + + /* Read the fourth input */ + Xn2 = *pIn++; + + d1 = (a1 * acc0) + d1; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + + d2 = (a2 * acc0) + d2; + + /* y[n] = b0 * x[n] + d1 */ + acc1 = (b0 * Xn2) + d1; + + d1 = (b1 * Xn2) + d2; + + d2 = (b2 * Xn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc1; + + d1 = (a1 * acc1) + d1; + + d2 = (a2 * acc1) + d2; + + /* decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = blockSize & 0x3u; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* y[n] = b0 * x[n] + d1 */ + acc0 = (b0 * Xn) + d1; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + + /* Every time after the output is computed state should be updated. */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + d1 = ((b1 * Xn) + (a1 * acc0)) + d2; + + /* d2 = b2 * x[n] + a2 * y[n] */ + d2 = (b2 * Xn) + (a2 * acc0); + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1; + *pState++ = d2; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while(stage > 0u); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + + sample = blockSize; + + while(sample > 0u) + { + /* Read the input */ + Xn = *pIn++; + + /* y[n] = b0 * x[n] + d1 */ + acc0 = (b0 * Xn) + d1; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + + /* Every time after the output is computed state should be updated. */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + d1 = ((b1 * Xn) + (a1 * acc0)) + d2; + + /* d2 = b2 * x[n] + a2 * y[n] */ + d2 = (b2 * Xn) + (a2 * acc0); + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1; + *pState++ = d2; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while(stage > 0u); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + + + /** + * @} end of BiquadCascadeDF2T group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c new file mode 100644 index 000000000..e4225d008 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c @@ -0,0 +1,97 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_biquad_cascade_df2T_init_f32.c +* +* Description: Initialization function for the floating-point transposed +* direct form II Biquad cascade filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF2T + * @{ + */ + +/** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] *S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @return none + * + * Coefficient and State Ordering: + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
    
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
+ * 
+ * + * \par + * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState is a pointer to state array. + * Each Biquad stage has 2 state variables d1, and d2. + * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on. + * The state array has a total length of 2*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df2T_init_f32( + arm_biquad_cascade_df2T_instance_f32 * S, + uint8_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 2 * numStages */ + memset(pState, 0, (2u * (uint32_t) numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c new file mode 100644 index 000000000..48dd45f2b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c @@ -0,0 +1,646 @@ +/* ---------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_f32.c +* +* Description: Convolution of floating-point sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup Conv Convolution + * + * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. + * Convolution is similar to correlation and is frequently used in filtering and data analysis. + * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. + * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3. + * + * \par Algorithm + * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. + * Then the convolution + * + *
    
+ *                   c[n] = a[n] * b[n]    
+ * 
+ * + * \par + * is defined as + * \image html ConvolutionEquation.gif + * \par + * Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2. + * pSrcA points to the first input vector of length srcALen and + * pSrcB points to the second input vector of length srcBLen. + * The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result. + * + * \par + * Conceptually, when two signals a[n] and b[n] are convolved, + * the signal b[n] slides over a[n]. + * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together. + * + * \par + * Note that convolution is a commutative operation: + * + *
    
+ *                   a[n] * b[n] = b[n] * a[n].    
+ * 
+ * + * \par + * This means that switching the A and B arguments to the convolution functions has no effect. + * + * Fixed-Point Behavior + * + * \par + * Convolution requires summing up a large number of intermediate products. + * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. + * Refer to the function specific documentation below for further details of the particular algorithm used. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + */ + +void arm_conv_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1; /* inputA pointer */ + float32_t *pIn2; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* x[1] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* x[2] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* x[3] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += x0 * c0; + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += x1 * c0; + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += x2 * c0; + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 += x3 * c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 += x0 * c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 += x1 * c0; + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px + 3u); + px += 4u; + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 += x2 * c0; + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + *pOut++ = acc1; + *pOut++ = acc2; + *pOut++ = acc3; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + + /* Loop to calculate convolution for output length number of times */ + for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++) + { + /* Initialize sum with zero to carry out MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[i - j]; + } + } + /* Store the output in the destination buffer */ + pDst[i] = sum; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c new file mode 100644 index 000000000..3e85a25ba --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c @@ -0,0 +1,538 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_fast_opt_q15.c +* +* Description: Fast Q15 Convolution. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results + * but provides only a single guard bit. There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. + */ + +void arm_conv_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + uint32_t tapCnt; /* loop count */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + + /* Copy smaller length input sequence in reverse order into second scratch buffer */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Assuming scratch1 buffer is aligned by 32-bit */ + /* Fill (srcBLen - 1u) zeros in scratch1 buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + /* First part of the processing with loop unrolling process 4 data points at a time. + ** a second loop below process for the remaining 1 to 3 samples. */ + + /* Actual convolution process starts here */ + blkCnt = (srcALen + srcBLen - 1u) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2u); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLAD(x2, y2, acc0); + acc2 = __SMLAD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + acc1 = __SMLADX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2u); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x1, y1, acc0); + + acc2 = __SMLAD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLADX(x3, y1, acc1); + + a = *pScr1; + b = *(pScr1 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + acc1 = __SMLADX(x3, y2, acc1); + + a = *(pScr1 + 2); + b = *(pScr1 + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* update scratch pointers */ + pIn2 += 4u; + pScr1 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + + + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = (srcALen + srcBLen - 1u) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + acc0 += (*pScr1++ * *pIn2++); + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1u; + + } + +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c new file mode 100644 index 000000000..e21be3523 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c @@ -0,0 +1,1405 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_fast_q15.c +* +* Description: Fast Q15 Convolution. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results + * but provides only a single guard bit. There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. + */ + +void arm_conv_fast_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ +#ifndef UNALIGNED_SUPPORT_DISABLE + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1u); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2u; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + /* Read y[srcBLen - 7] */ + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); + *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16); + *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2u; + + while((j > 0u) && (blockSize3 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + q15_t a, b; + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1u); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + a = *px++; + b = *px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *px; + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *px; + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x2 = __PKHBT(a, b, 16); + a = *(px + 2); + x3 = __PKHBT(b, a, 16); + +#else + + x2 = __PKHBT(b, a, 16); + a = *(px + 2); + x3 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5], x[6] */ + a = *(px + 2); + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 4); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 4); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + a = *px; + b = *(px+1); + px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + /* Read y[srcBLen - 7] */ + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + a = *(px+2); + b = *(px+3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q15_t)(acc0 >> 15); + *pOut++ = (q15_t)(acc1 >> 15); + *pOut++ = (q15_t)(acc2 >> 15); + *pOut++ = (q15_t)(acc3 >> 15); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2u; + + while((j > 0u) && (blockSize3 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while(k > 0u) + { + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c new file mode 100644 index 000000000..e675c11ce --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c @@ -0,0 +1,572 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_fast_q31.c +* +* Description: Q31 Convolution (fast version). +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are accumulated in a 32-bit register in 2.30 format. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * + * \par + * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signals must be scaled down. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * + * \par + * See arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. + */ + +void arm_conv_fast_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[1] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[2] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[3] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc1 += x[3] * y[srcBLen - 3] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc2 += x[4] * y[srcBLen - 3] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc3 += x[5] * y[srcBLen - 3] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q31_t) (acc0 << 1); + *pOut++ = (q31_t) (acc1 << 1); + *pOut++ = (q31_t) (acc2 << 1); + *pOut++ = (q31_t) (acc3 << 1); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c new file mode 100644 index 000000000..70b4125d9 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c @@ -0,0 +1,544 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_opt_q15.c +* +* Description: Convolution of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * + * \par + * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * + */ + +void arm_conv_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + q63_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + uint32_t tapCnt; /* loop count */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + /* Copy smaller length input sequence in reverse order into second scratch buffer */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Assuming scratch1 buffer is aligned by 32-bit */ + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + +#endif + + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + /* First part of the processing with loop unrolling process 4 data points at a time. + ** a second loop below process for the remaining 1 to 3 samples. */ + + /* Actual convolution process starts here */ + blkCnt = (srcALen + srcBLen - 1u) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2u); + + /* multiply and accumlate */ + acc0 = __SMLALD(x1, y1, acc0); + acc2 = __SMLALD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLALDX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLALD(x2, y2, acc0); + acc2 = __SMLALD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + acc1 = __SMLALDX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2u); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x1, y1, acc0); + + acc2 = __SMLALD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLALDX(x3, y1, acc1); + + a = *pScr1; + b = *(pScr1 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x2, y2, acc0); + + acc2 = __SMLALD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + + acc1 = __SMLALDX(x3, y2, acc1); + + a = *(pScr1 + 2); + b = *(pScr1 + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + pIn2 += 4u; + pScr1 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = (srcALen + srcBLen - 1u) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + acc0 += (*pScr1++ * *pIn2++); + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1u; + + } + +} + + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c new file mode 100644 index 000000000..7fe9f55cf --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c @@ -0,0 +1,434 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_opt_q7.c +* +* Description: Convolution of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. + * + */ + +void arm_conv_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ + q15_t x4; /* Temporary input variable */ + q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ + uint32_t j, k, blkCnt, tapCnt; /* loop counter */ + q7_t *px; /* Temporary input1 pointer */ + q15_t *py; /* Temporary input2 pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3, y1; /* Temporary input variables */ + q7_t *pOut = pDst; /* output pointer */ + q7_t out0, out1, out2, out3; /* temporary variables */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2; + + /* points to smaller length sequence */ + px = pIn2 + srcBLen - 1; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy (srcALen) samples in scratch buffer */ + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* Initialization of pIn2 pointer */ + pIn2 = (q7_t *) py; + + pScr2 = py; + + /* Actual convolution process starts here */ + blkCnt = (srcALen + srcBLen - 1u) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2 + 2u); + + acc0 = __SMLAD(x2, y1, acc0); + + acc2 = __SMLAD(x1, y1, acc2); + + acc1 = __SMLADX(x3, y1, acc1); + + x2 = *__SIMD32(pScr1)++; + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + pScr2 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); + out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); + out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); + out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); + + *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = (srcALen + srcBLen - 1u) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + acc0 += (*pScr1++ * *pScr2++); + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1u; + + } + +} + + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c new file mode 100644 index 000000000..58f572735 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c @@ -0,0 +1,661 @@ +/* ---------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_f32.c +* +* Description: Partial convolution of floating-point sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup PartialConv Partial Convolution + * + * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated. + * Each function has two additional arguments. + * firstIndex specifies the starting index of the subset of output samples. + * numPoints is the number of output samples to compute. + * The function computes the output in the range + * [firstIndex, ..., firstIndex+numPoints-1]. + * The output array pDst contains numPoints values. + * + * The allowable range of output indices is [0 srcALen+srcBLen-2]. + * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR. + * Otherwise the functions return ARM_MATH_SUCCESS. + * \note Refer arm_conv_f32() for details on fixed point behavior. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15 of partial convolution. Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of partial convolution + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + +arm_status arm_conv_partial_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count = 0u, blkCnt, check; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = (int32_t) check - (int32_t) srcALen; + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = ((int32_t) check - blockSize3) - + (blockSize1 + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + firstIndex; + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* x[1] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* x[2] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* x[3] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc1; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2u); + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += x0 * c0; + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += x1 * c0; + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += x2 * c0; + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 += x3 * c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 += x0 * c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 += x1 * c0; + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 += x2 * c0; + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + *pOut++ = acc1; + *pOut++ = acc2; + *pOut++ = acc3; + + /* Increment the pointer pIn1 index, count by 1 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + while(blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations for inputs */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[i - j]; + } + } + /* Store the output in the destination buffer */ + pDst[i] = sum; + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c new file mode 100644 index 000000000..25b3ba8c3 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c @@ -0,0 +1,763 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_fast_opt_q15.c +* +* Description: Fast Q15 Partial convolution. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + +arm_status arm_conv_partial_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; + + uint32_t tapCnt; /* loop count */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + + /* Copy smaller length input sequence in reverse order into second scratch buffer */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Assuming scratch1 buffer is aligned by 32-bit */ + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* First part of the processing with loop unrolling process 4 data points at a time. + ** a second loop below process for the remaining 1 to 3 samples. */ + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2u); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + acc1 = __SMLADX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2u); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + + /* update scratch pointers */ + pIn2 += 4u; + pScr1 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read two samples from smaller buffer */ + y1 = *__SIMD32(pIn2)++; + + acc0 = __SMLAD(x1, y1, acc0); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1u; + + } + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + /* Return to application */ + return (status); +} + +#else + +arm_status arm_conv_partial_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; /* Status variable */ + uint32_t tapCnt; /* loop count */ + q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ + q15_t y10, y11; /* Temporary variables to hold srcB buffer */ + + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read next two samples from scratch1 buffer */ + x20 = *pScr1++; + x21 = *pScr1++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read two samples from smaller buffer */ + y10 = *pIn2; + y11 = *(pIn2 + 1u); + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + acc2 += (q31_t) x20 *y10; + acc2 += (q31_t) x21 *y11; + + /* multiply and accumlate */ + acc1 += (q31_t) x11 *y10; + acc1 += (q31_t) x20 *y11; + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1; + x11 = *(pScr1 + 1u); + + /* multiply and accumlate */ + acc3 += (q31_t) x21 *y10; + acc3 += (q31_t) x10 *y11; + + /* Read next two samples from scratch2 buffer */ + y10 = *(pIn2 + 2u); + y11 = *(pIn2 + 3u); + + /* multiply and accumlate */ + acc0 += (q31_t) x20 *y10; + acc0 += (q31_t) x21 *y11; + acc2 += (q31_t) x10 *y10; + acc2 += (q31_t) x11 *y11; + acc1 += (q31_t) x21 *y10; + acc1 += (q31_t) x10 *y11; + + /* Read next two samples from scratch1 buffer */ + x20 = *(pScr1 + 2); + x21 = *(pScr1 + 3); + + /* multiply and accumlate */ + acc3 += (q31_t) x11 *y10; + acc3 += (q31_t) x20 *y11; + + /* update scratch pointers */ + pIn2 += 4u; + pScr1 += 4u; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = __SSAT((acc0 >> 15), 16); + *pOut++ = __SSAT((acc1 >> 15), 16); + *pOut++ = __SSAT((acc2 >> 15), 16); + *pOut++ = __SSAT((acc3 >> 15), 16); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read two samples from smaller buffer */ + y10 = *pIn2++; + y11 = *pIn2++; + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1u; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c new file mode 100644 index 000000000..98216bb6a --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c @@ -0,0 +1,1473 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_fast_q15.c +* +* Description: Fast Q15 Partial convolution. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + */ + + +arm_status arm_conv_partial_fast_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ +#ifndef UNALIGNED_SUPPORT_DISABLE + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >=srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t) check - (int32_t) srcALen); + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2 - 1u; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2u); + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2u; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); + *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16); + *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = count >> 2u; + + while((j > 0u) && (blockSize3 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + q15_t a, b; + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >=srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t) check - (int32_t) srcALen); + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2 - 1u; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2u); + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + a = *px++; + b = *px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *px; + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *px; + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x2 = __PKHBT(a, b, 16); + a = *(px + 2); + x3 = __PKHBT(b, a, 16); + +#else + + x2 = __PKHBT(b, a, 16); + a = *(px + 2); + x3 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5], x[6] */ + a = *(px + 2); + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 4); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 4); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + a = *px; + b = *(px+1); + px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + /* Read y[srcBLen - 7] */ + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + a = *(px+2); + b = *(px+3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q15_t)(acc0 >> 15); + *pOut++ = (q15_t)(acc1 >> 15); + *pOut++ = (q15_t)(acc2 >> 15); + *pOut++ = (q15_t)(acc3 >> 15); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = count >> 2u; + + while((j > 0u) && (blockSize3 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + /* Decrement the loop counter */ + k--; + } + + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ +} + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c new file mode 100644 index 000000000..17902f593 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c @@ -0,0 +1,599 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_fast_q31.c +* +* Description: Fast Q31 Partial convolution. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par + * See arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. + */ + +arm_status arm_conv_partial_fast_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t x0, x1, x2, x3, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t) check - (int32_t) srcALen); + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while(blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[1] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[2] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[3] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2 */ + blkCnt = ((uint32_t) blockSize2 >> 2u); + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (acc0 << 1); + *pOut++ = (q31_t) (acc1 << 1); + *pOut++ = (q31_t) (acc2 << 1); + *pOut++ = (q31_t) (acc3 << 1); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +} + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c new file mode 100644 index 000000000..fe14f9fd3 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c @@ -0,0 +1,764 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_opt_q15.c +* +* Description: Partial convolution of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * + */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + +arm_status arm_conv_partial_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q63_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; /* Status variable */ + uint32_t tapCnt; /* loop count */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2u); + + /* multiply and accumlate */ + acc0 = __SMLALD(x1, y1, acc0); + acc2 = __SMLALD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLALDX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLALD(x2, y2, acc0); + acc2 = __SMLALD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + acc1 = __SMLALDX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2u); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + + /* update scratch pointers */ + pIn2 += 4u; + pScr1 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read two samples from smaller buffer */ + y1 = *__SIMD32(pIn2)++; + + acc0 = __SMLALD(x1, y1, acc0); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1u; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#else + +arm_status arm_conv_partial_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q63_t acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; /* Status variable */ + uint32_t tapCnt; /* loop count */ + q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ + q15_t y10, y11; /* Temporary variables to hold srcB buffer */ + + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read next two samples from scratch1 buffer */ + x20 = *pScr1++; + x21 = *pScr1++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read two samples from smaller buffer */ + y10 = *pIn2; + y11 = *(pIn2 + 1u); + + /* multiply and accumlate */ + acc0 += (q63_t) x10 *y10; + acc0 += (q63_t) x11 *y11; + acc2 += (q63_t) x20 *y10; + acc2 += (q63_t) x21 *y11; + + /* multiply and accumlate */ + acc1 += (q63_t) x11 *y10; + acc1 += (q63_t) x20 *y11; + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1; + x11 = *(pScr1 + 1u); + + /* multiply and accumlate */ + acc3 += (q63_t) x21 *y10; + acc3 += (q63_t) x10 *y11; + + /* Read next two samples from scratch2 buffer */ + y10 = *(pIn2 + 2u); + y11 = *(pIn2 + 3u); + + /* multiply and accumlate */ + acc0 += (q63_t) x20 *y10; + acc0 += (q63_t) x21 *y11; + acc2 += (q63_t) x10 *y10; + acc2 += (q63_t) x11 *y11; + acc1 += (q63_t) x21 *y10; + acc1 += (q63_t) x10 *y11; + + /* Read next two samples from scratch1 buffer */ + x20 = *(pScr1 + 2); + x21 = *(pScr1 + 3); + + /* multiply and accumlate */ + acc3 += (q63_t) x11 *y10; + acc3 += (q63_t) x20 *y11; + + /* update scratch pointers */ + pIn2 += 4u; + pScr1 += 4u; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = __SSAT((acc0 >> 15), 16); + *pOut++ = __SSAT((acc1 >> 15), 16); + *pOut++ = __SSAT((acc2 >> 15), 16); + *pOut++ = __SSAT((acc3 >> 15), 16); + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read two samples from smaller buffer */ + y10 = *pIn2++; + y11 = *pIn2++; + + /* multiply and accumlate */ + acc0 += (q63_t) x10 *y10; + acc0 += (q63_t) x11 *y11; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1u; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c new file mode 100644 index 000000000..513d89d87 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c @@ -0,0 +1,806 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_opt_q7.c +* +* Description: Partial convolution of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * + * + */ + + +#ifndef UNALIGNED_SUPPORT_DISABLE + +arm_status arm_conv_partial_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ + q15_t x4; /* Temporary input variable */ + q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ + uint32_t j, k, blkCnt, tapCnt; /* loop counter */ + q7_t *px; /* Temporary input1 pointer */ + q15_t *py; /* Temporary input2 pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3, y1; /* Temporary input variables */ + arm_status status; + q7_t *pOut = pDst; /* output pointer */ + q7_t out0, out1, out2, out3; /* temporary variables */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2; + + /* points to smaller length sequence */ + px = pIn2 + srcBLen - 1; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy (srcALen) samples in scratch buffer */ + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* Initialization of pIn2 pointer */ + pIn2 = (q7_t *) py; + + pScr2 = py; + + pOut = pDst + firstIndex; + + pScratch1 += firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2 + 2u); + + acc0 = __SMLAD(x2, y1, acc0); + + acc2 = __SMLAD(x1, y1, acc2); + + acc1 = __SMLADX(x3, y1, acc1); + + x2 = *__SIMD32(pScr1)++; + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + pScr2 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); + out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); + out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); + out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); + + *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4u; + + } + + blkCnt = (numPoints) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read two samples from smaller buffer */ + y1 = *__SIMD32(pScr2)++; + + acc0 = __SMLAD(x1, y1, acc0); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1u; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + + } + + return (status); + +} + +#else + +arm_status arm_conv_partial_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ + q15_t x4; /* Temporary input variable */ + q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ + uint32_t j, k, blkCnt, tapCnt; /* loop counter */ + q7_t *px; /* Temporary input1 pointer */ + q15_t *py; /* Temporary input2 pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + arm_status status; + q7_t *pOut = pDst; /* output pointer */ + q15_t x10, x11, x20, x21; /* Temporary input variables */ + q15_t y10, y11; /* Temporary input variables */ + q7_t out0, out1, out2, out3; /* temporary variables */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2; + + /* points to smaller length sequence */ + px = pIn2 + srcBLen - 1; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy (srcALen) samples in scratch buffer */ + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* Initialization of pIn2 pointer */ + pIn2 = (q7_t *) py; + + pScr2 = py; + + pOut = pDst + firstIndex; + + pScratch1 += firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read next two samples from scratch1 buffer */ + x20 = *pScr1++; + x21 = *pScr1++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read four samples from smaller buffer */ + y10 = *pScr2; + y11 = *(pScr2 + 1u); + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + acc2 += (q31_t) x20 *y10; + acc2 += (q31_t) x21 *y11; + + + acc1 += (q31_t) x11 *y10; + acc1 += (q31_t) x20 *y11; + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1; + x11 = *(pScr1 + 1u); + + /* multiply and accumlate */ + acc3 += (q31_t) x21 *y10; + acc3 += (q31_t) x10 *y11; + + /* Read next two samples from scratch2 buffer */ + y10 = *(pScr2 + 2u); + y11 = *(pScr2 + 3u); + + /* multiply and accumlate */ + acc0 += (q31_t) x20 *y10; + acc0 += (q31_t) x21 *y11; + acc2 += (q31_t) x10 *y10; + acc2 += (q31_t) x11 *y11; + acc1 += (q31_t) x21 *y10; + acc1 += (q31_t) x10 *y11; + + /* Read next two samples from scratch1 buffer */ + x20 = *(pScr1 + 2); + x21 = *(pScr1 + 3); + + /* multiply and accumlate */ + acc3 += (q31_t) x11 *y10; + acc3 += (q31_t) x20 *y11; + + /* update scratch pointers */ + + pScr1 += 4u; + pScr2 += 4u; + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); + out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); + out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); + out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); + + + *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4u; + + } + + blkCnt = (numPoints) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read two samples from smaller buffer */ + y10 = *pScr2++; + y11 = *pScr2++; + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1u; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + return (status); + +} + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c new file mode 100644 index 000000000..8c7ed5f86 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c @@ -0,0 +1,778 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_q15.c +* +* Description: Partial convolution of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + + +arm_status arm_conv_partial_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + +#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary input variables */ + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t) check - (int32_t) srcALen); + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2 - 1u; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2u; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLALDX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLALDX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLALDX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLALDX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLALDX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLALDX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + + c0 = *(py-1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = count >> 2u; + + while((j > 0u) && (blockSize3 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* inputA pointer */ + q15_t *pIn2 = pSrcB; /* inputB pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q31_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c new file mode 100644 index 000000000..16a060648 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c @@ -0,0 +1,599 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_q31.c +* +* Description: Partial convolution of Q31 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * See arm_conv_partial_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +arm_status arm_conv_partial_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q63_t sum, acc0, acc1, acc2; /* Accumulator */ + q31_t x0, x1, x2, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ + arm_status status; /* status of Partial convolution */ + + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t) check - (int32_t) srcALen); + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while(blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* x[1] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* x[2] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* x[3] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blkCnt */ + + blkCnt = blockSize2 / 3; + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py); + + /* Read x[2] sample */ + x2 = *(px); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += (q63_t) x0 *c0; + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += (q63_t) x1 *c0; + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += (q63_t) x2 *c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py - 1u); + + /* Read x[3] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += (q63_t) x1 *c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += (q63_t) x2 *c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += (q63_t) x0 *c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py - 2u); + + /* Read x[4] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += (q63_t) x2 *c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += (q63_t) x0 *c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += (q63_t) x1 *c0; + + + px += 3u; + + py -= 3u; + + } while(--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += (q63_t) x0 *c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += (q63_t) x1 *c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += (q63_t) x2 *c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (acc0 >> 31); + *pOut++ = (q31_t) (acc1 >> 31); + *pOut++ = (q31_t) (acc2 >> 31); + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* inputA pointer */ + q31_t *pIn2 = pSrcB; /* inputB pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q31_t) (sum >> 31u); + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c new file mode 100644 index 000000000..ed205838e --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c @@ -0,0 +1,733 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_partial_q7.c +* +* Description: Partial convolution of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par + * Refer the function arm_conv_partial_opt_q7() for a faster implementation of this function. + * + */ + +arm_status arm_conv_partial_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t input1, input2; + q15_t in1, in2; + q7_t x0, x1, x2, x3, c0, c1; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ + arm_status status; + + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t) check - (int32_t) srcALen); + blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1u + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 1] , y[srcBLen - 2] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[0] * y[srcBLen - 1] */ + /* x[1] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 3] , y[srcBLen - 4] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[2] * y[srcBLen - 3] */ + /* x[3] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2u); + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + /* Read y[srcBLen - 2] sample */ + c1 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + /* Read y[srcBLen - 4] sample */ + c1 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *(px++); + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q31_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q31_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q31_t) x2 * c0); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += ((q31_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); + *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); + *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); + *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* inputA pointer */ + q7_t *pIn2 = pSrcB; /* inputB pointer */ + q31_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q15_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c new file mode 100644 index 000000000..1907719e7 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c @@ -0,0 +1,733 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_q15.c +* +* Description: Convolution of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_conv_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + +void arm_conv_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ + +#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1u); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2u; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLALDX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLALDX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLALDX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLALDX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLALDX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLALDX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + + c0 = *(py-1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + blockSize3 = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2u; + + while((j > 0u) && (blockSize3 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* input pointer */ + q15_t *pIn2 = pSrcB; /* coefficient pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += (q31_t) pIn1[j] * (pIn2[i - j]); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); + } + +#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/ + +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c new file mode 100644 index 000000000..769b95abb --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c @@ -0,0 +1,564 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_q31.c +* +* Description: Convolution of Q31 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * See arm_conv_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_conv_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q63_t sum; /* Accumulator */ + q63_t acc0, acc1, acc2; /* Accumulator */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (q31_t *) pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = (q31_t *) pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* x[1] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* x[2] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* x[3] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll by 3 */ + blkCnt = blockSize2 / 3; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py); + + /* Read x[3] sample */ + x2 = *(px); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += ((q63_t) x2 * c0); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py - 1u); + + /* Read x[4] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += ((q63_t) x1 * c0); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += ((q63_t) x2 * c0); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += ((q63_t) x0 * c0); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py - 2u); + + /* Read x[5] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += ((q63_t) x2 * c0); + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += ((q63_t) x0 * c0); + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += ((q63_t) x1 * c0); + + /* update scratch pointers */ + px += 3u; + py -= 3u; + + } while(--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q31_t) (acc0 >> 31); + *pOut++ = (q31_t) (acc1 >> 31); + *pOut++ = (q31_t) (acc2 >> 31); + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* input pointer */ + q31_t *pIn2 = pSrcB; /* coefficient pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q31_t) (sum >> 31u); + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c new file mode 100644 index 000000000..eb78fd533 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c @@ -0,0 +1,689 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_conv_q7.c +* +* Description: Convolution of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. + * + * \par + * Refer the function arm_conv_opt_q7() for a faster implementation of this function. + * + */ + +void arm_conv_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t input1, input2; /* Temporary input variables */ + q15_t in1, in2; /* Temporary input variables */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = (srcALen - srcBLen) + 1u; + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 1] , y[srcBLen - 2] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* x[0] * y[srcBLen - 1] */ + /* x[1] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 3] , y[srcBLen - 4] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* x[2] * y[srcBLen - 3] */ + /* x[3] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + /* Read y[srcBLen - 2] sample */ + c1 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + /* Read y[srcBLen - 4] sample */ + c1 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *(px++); + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q15_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q15_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q15_t) x2 * c0); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); + *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); + *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); + *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* input pointer */ + q7_t *pIn2 = pSrcB; /* coefficient pointer */ + q31_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += (q15_t) pIn1[j] * (pIn2[i - j]); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of Conv group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c new file mode 100644 index 000000000..6a99eafc1 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c @@ -0,0 +1,738 @@ +/* ---------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_f32.c +* +* Description: Correlation of floating-point sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup Corr Correlation + * + * Correlation is a mathematical operation that is similar to convolution. + * As with convolution, correlation uses two signals to produce a third signal. + * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution. + * Correlation is commonly used to measure the similarity between two signals. + * It has applications in pattern recognition, cryptanalysis, and searching. + * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types. + * Fast versions of the Q15 and Q31 functions are also provided. + * + * \par Algorithm + * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. + * The convolution of the two signals is denoted by + *
    
+ *                   c[n] = a[n] * b[n]    
+ * 
+ * In correlation, one of the signals is flipped in time + *
    
+ *                   c[n] = a[n] * b[-n]    
+ * 
+ * + * \par + * and this is mathematically defined as + * \image html CorrelateEquation.gif + * \par + * The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen. + * The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2). + * The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result. + * + * Note + * \par + * The pDst should be initialized to all zeros before being used. + * + * Fixed-Point Behavior + * \par + * Correlation requires summing up a large number of intermediate products. + * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. + * Refer to the function specific documentation below for further details of the particular algorithm used. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate + */ + +/** + * @addtogroup Corr + * @{ + */ +/** + * @brief Correlation of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + */ + +void arm_correlate_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1; /* inputA pointer */ + float32_t *pIn2; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding has to be done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + //while(j > 0u) + //{ + // /* Zero is stored in the destination buffer */ + // *pOut++ = 0.0f; + + // /* Decrement the loop counter */ + // j--; + //} + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 4] */ + sum += *px++ * *py++; + /* x[1] * y[srcBLen - 3] */ + sum += *px++ * *py++; + /* x[2] * y[srcBLen - 2] */ + sum += *px++ * *py++; + /* x[3] * y[srcBLen - 1] */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[0] sample */ + c0 = *(py++); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[0] */ + acc0 += x0 * c0; + /* acc1 += x[1] * y[0] */ + acc1 += x1 * c0; + /* acc2 += x[2] * y[0] */ + acc2 += x2 * c0; + /* acc3 += x[3] * y[0] */ + acc3 += x3 * c0; + + /* Read y[1] sample */ + c0 = *(py++); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[1] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[1] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[1] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[1] */ + acc3 += x0 * c0; + + /* Read y[2] sample */ + c0 = *(py++); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[2] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[2] */ + acc3 += x1 * c0; + + /* Read y[3] sample */ + c0 = *(py++); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[3] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[3] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[3] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[3] */ + acc3 += x2 * c0; + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[4] sample */ + c0 = *(py++); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[4] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[4] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[4] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = acc0; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = acc1; + pOut += inc; + + *pOut = acc2; + pOut += inc; + + *pOut = acc3; + pOut += inc; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py++; + sum += *px++ * *py++; + sum += *px++ * *py++; + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum += *px++ * *py++; + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + sum += *px++ * *py++; + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum += *px++ * *py++; + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i = 0u, j; /* loop counters */ + uint32_t inv = 0u; /* Reverse order flag */ + uint32_t tot = 0u; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using convolution but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2u); + + if(srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if(srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1u); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate convolution for output length number of times */ + for (i = 0u; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[-((int32_t) i - j)]; + } + } + /* Store the output in the destination buffer */ + if(inv == 1) + *pDst-- = sum; + else + *pDst++ = sum; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c new file mode 100644 index 000000000..a99225a2f --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c @@ -0,0 +1,507 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_fast_opt_q15.c +* +* Description: Fast Q15 Correlation. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch buffers should be aligned by 32-bit + * + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + */ + +void arm_correlate_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch) +{ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *py; /* Intermediate inputB pointer */ + q31_t x1, x2, x3; /* temporary variables for holding input and coefficient values */ + uint32_t j, blkCnt, outBlockSize; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + uint32_t tapCnt; + q31_t y1, y2; + q15_t *pScr; /* Intermediate pointers */ + q15_t *pOut = pDst; /* output pointer */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + pScr = pScratch; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr += (srcBLen - 1u); + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr, srcALen); + + /* Update pointers */ + pScr += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + j = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(j > 0u) + { + /* copy second buffer in reversal manner */ + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = srcALen % 0x4u; + + while(j > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1u)); + + /* Update pointer */ + pScr += (srcBLen - 1u); + +#else + +/* Apply loop unrolling and do 4 Copies simultaneously. */ + j = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(j > 0u) + { + /* copy second buffer in reversal manner */ + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = (srcBLen - 1u) % 0x4u; + + while(j > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for scratch2 */ + py = pIn2; + + + /* Actual correlation process starts here */ + blkCnt = (srcALen + srcBLen - 1u) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read four samples from scratch1 buffer */ + x1 = *__SIMD32(pScr)++; + + /* Read next four samples from scratch1 buffer */ + x2 = *__SIMD32(pScr)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2u); + + acc0 = __SMLAD(x1, y1, acc0); + + acc2 = __SMLAD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLADX(x3, y1, acc1); + + x1 = _SIMD32_OFFSET(pScr); + + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + acc1 = __SMLADX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr + 2u); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x1, y1, acc0); + + acc2 = __SMLAD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLADX(x3, y1, acc1); + + a = *pScr; + b = *(pScr + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + acc1 = __SMLADX(x3, y2, acc1); + + a = *(pScr + 2); + b = *(pScr + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + pIn2 += 4u; + + pScr += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr -= 4u; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2); + acc1 += (*pScr++ * *pIn2); + acc2 += (*pScr++ * *pIn2); + acc3 += (*pScr++ * *pIn2++); + + pScr -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut = (__SSAT(acc0 >> 15u, 16)); + pOut += inc; + *pOut = (__SSAT(acc1 >> 15u, 16)); + pOut += inc; + *pOut = (__SSAT(acc2 >> 15u, 16)); + pOut += inc; + *pOut = (__SSAT(acc3 >> 15u, 16)); + pOut += inc; + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 4u; + + } + + + blkCnt = (srcALen + srcBLen - 1u) & 0x3; + + /* Calculate correlation for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + acc0 += (*pScr++ * *pIn2++); + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + + *pOut = (q15_t) (__SSAT((acc0 >> 15), 16)); + + pOut += inc; + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 1u; + + } +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c new file mode 100644 index 000000000..0c79bc896 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c @@ -0,0 +1,1314 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_fast_q15.c +* +* Description: Fast Q15 Correlation. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + */ + +void arm_correlate_fast_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ +#ifndef UNALIGNED_SUPPORT_DISABLE + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum = __SMLAD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px + 1); + px += 2u; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the first two inputB samples using SIMD: + * y[0] and y[1] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLAD(x0, c0, acc0); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLAD(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px + 1); + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLAD(x2, c0, acc2); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLAD(x3, c0, acc3); + + /* Read y[2] and y[3] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLAD(x2, c0, acc0); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLAD(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px + 2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px + 3); + px += 4u; + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLAD(x0, c0, acc2); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLAD(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[4] */ + c0 = *py; +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py)++; + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + + c0 = (*py); + /* Read y[6] */ +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px + 2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (acc0 >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q15_t) (acc1 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc2 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc3 >> 15); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + q15_t a, b; + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 2); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 2); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 2u; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the first two inputB samples using SIMD: + * y[0] and y[1] */ + a = *py; + b = *(py + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLAD(x0, c0, acc0); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLAD(x1, c0, acc1); + + /* Read x[2], x[3], x[4] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x2 = __PKHBT(a, b, 16); + a = *(px + 2); + x3 = __PKHBT(b, a, 16); + +#else + + x2 = __PKHBT(b, a, 16); + a = *(px + 2); + x3 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLAD(x2, c0, acc2); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLAD(x3, c0, acc3); + + /* Read y[2] and y[3] */ + a = *(py + 2); + b = *(py + 3); + + py += 4u; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLAD(x2, c0, acc0); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLAD(x3, c0, acc1); + + /* Read x[4], x[5], x[6] */ + a = *(px + 2); + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 4); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 4); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 4u; + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLAD(x0, c0, acc2); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLAD(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[4] */ + c0 = *py; +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + a = *px; + b = *(px + 1); + + px++;; + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[4], y[5] */ + a = *py; + b = *(py + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[4], y[5] */ + a = *py; + b = *(py + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + py += 2u; + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + + c0 = (*py); + /* Read y[6] */ +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (acc0 >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q15_t) (acc1 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc2 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc3 >> 15); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c new file mode 100644 index 000000000..628fd3ea5 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c @@ -0,0 +1,607 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_fast_q31.c +* +* Description: Fast Q31 Correlation. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are accumulated in a 32-bit register in 2.30 format. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * + * \par + * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signals must be scaled down. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * + * \par + * See arm_correlate_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. + */ + +void arm_correlate_fast_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* x[1] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* x[2] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* x[3] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[0] sample */ + c0 = *(py++); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[0] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[1] * y[0] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[2] * y[0] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[3] * y[0] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read y[1] sample */ + c0 = *(py++); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[1] * y[1] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc1 += x[2] * y[1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc2 += x[3] * y[1] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc3 += x[4] * y[1] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read y[2] sample */ + c0 = *(py++); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[2] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc1 += x[3] * y[2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc2 += x[4] * y[2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc3 += x[5] * y[2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read y[3] sample */ + c0 = *(py++); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[3] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc1 += x[4] * y[3] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc2 += x[5] * y[3] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc3 += x[6] * y[3] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[4] sample */ + c0 = *(py++); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[5] * y[4] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[6] * y[4] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[7] * y[4] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (acc0 << 1); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q31_t) (acc1 << 1); + pOut += inc; + + *pOut = (q31_t) (acc2 << 1); + pOut += inc; + + *pOut = (q31_t) (acc3 << 1); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u; + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c new file mode 100644 index 000000000..cc33b54f5 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c @@ -0,0 +1,512 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_opt_q15.c +* +* Description: Correlation of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch buffers should be aligned by 32-bit + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * + */ + + +void arm_correlate_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch) +{ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *py; /* Intermediate inputB pointer */ + q31_t x1, x2, x3; /* temporary variables for holding input1 and input2 values */ + uint32_t j, blkCnt, outBlockSize; /* loop counter */ + int32_t inc = 1; /* output pointer increment */ + uint32_t tapCnt; + q31_t y1, y2; + q15_t *pScr; /* Intermediate pointers */ + q15_t *pOut = pDst; /* output pointer */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + pScr = pScratch; + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr += (srcBLen - 1u); + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr, srcALen); + + /* Update pointers */ + //pIn1 += srcALen; + pScr += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + j = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(j > 0u) + { + /* copy second buffer in reversal manner */ + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = srcALen % 0x4u; + + while(j > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1u)); + + /* Update pointer */ + pScr += (srcBLen - 1u); + +#else + +/* Apply loop unrolling and do 4 Copies simultaneously. */ + j = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(j > 0u) + { + /* copy second buffer in reversal manner */ + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = (srcBLen - 1u) % 0x4u; + + while(j > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for scratch2 */ + py = pIn2; + + + /* Actual correlation process starts here */ + blkCnt = (srcALen + srcBLen - 1u) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read four samples from scratch1 buffer */ + x1 = *__SIMD32(pScr)++; + + /* Read next four samples from scratch1 buffer */ + x2 = *__SIMD32(pScr)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2u); + + acc0 = __SMLALD(x1, y1, acc0); + + acc2 = __SMLALD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLALDX(x3, y1, acc1); + + x1 = _SIMD32_OFFSET(pScr); + + acc0 = __SMLALD(x2, y2, acc0); + + acc2 = __SMLALD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + + acc1 = __SMLALDX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr + 2u); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x1, y1, acc0); + + acc2 = __SMLALD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLALDX(x3, y1, acc1); + + a = *pScr; + b = *(pScr + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x2, y2, acc0); + + acc2 = __SMLALD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + + acc1 = __SMLALDX(x3, y2, acc1); + + a = *(pScr + 2); + b = *(pScr + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + pIn2 += 4u; + + pScr += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr -= 4u; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2); + acc1 += (*pScr++ * *pIn2); + acc2 += (*pScr++ * *pIn2); + acc3 += (*pScr++ * *pIn2++); + + pScr -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut = (__SSAT(acc0 >> 15u, 16)); + pOut += inc; + *pOut = (__SSAT(acc1 >> 15u, 16)); + pOut += inc; + *pOut = (__SSAT(acc2 >> 15u, 16)); + pOut += inc; + *pOut = (__SSAT(acc3 >> 15u, 16)); + pOut += inc; + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 4u; + + } + + + blkCnt = (srcALen + srcBLen - 1u) & 0x3; + + /* Calculate correlation for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + + acc0 += (*pScr++ * *pIn2++); + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT((acc0 >> 15), 16)); + + pOut += inc; + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 1u; + + } + + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c new file mode 100644 index 000000000..6749b01b3 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c @@ -0,0 +1,463 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_opt_q7.c +* +* Description: Correlation of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. + * + * + */ + + + +void arm_correlate_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + q7_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */ + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t j, k = 0u, blkCnt; /* loop counter */ + int32_t inc = 1; /* output pointer increment */ + uint32_t outBlockSize; /* loop counter */ + q15_t x4; /* Temporary input variable */ + uint32_t tapCnt; /* loop counter */ + q31_t x1, x2, x3, y1; /* Temporary input variables */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + + /* Copy (srcBLen) samples in scratch buffer */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Fill (srcBLen - 1u) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1u); + + /* Copy (srcALen) samples in scratch buffer */ + k = srcALen >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1u)); + + /* Update pointer */ + pScr1 += (srcBLen - 1u); + +#else + +/* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1u) >> 2u; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1u) % 0x4u; + + while(k > 0u) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for second sequence */ + py = pScratch2; + + /* Initialization of pScr2 pointer */ + pScr2 = pScratch2; + + /* Actual correlation process starts here */ + blkCnt = (srcALen + srcBLen - 1u) >> 2; + + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2u; + + while(tapCnt > 0u) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2 + 2u); + + acc0 = __SMLAD(x2, y1, acc0); + + acc2 = __SMLAD(x1, y1, acc2); + + acc1 = __SMLADX(x3, y1, acc1); + + x2 = *__SIMD32(pScr1)++; + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + pScr2 += 4u; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4u; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3u; + + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3u; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8)); + pOut += inc; + *pOut = (q7_t) (__SSAT(acc1 >> 7u, 8)); + pOut += inc; + *pOut = (q7_t) (__SSAT(acc2 >> 7u, 8)); + pOut += inc; + *pOut = (q7_t) (__SSAT(acc3 >> 7u, 8)); + pOut += inc; + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4u; + + } + + + blkCnt = (srcALen + srcBLen - 1u) & 0x3; + + /* Calculate correlation for remaining samples of Bigger length sequence */ + while(blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1u; + + while(tapCnt > 0u) + { + acc0 += (*pScr1++ * *pScr2++); + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1u; + + /* apply same above for remaining samples of smaller length sequence */ + while(tapCnt > 0u) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8)); + + pOut += inc; + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1u; + + } + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c new file mode 100644 index 000000000..fc4134b7b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c @@ -0,0 +1,718 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_q15.c +* +* Description: Correlation of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + +void arm_correlate_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ + +#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum = __SMLALD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT((sum >> 15), 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px + 1); + px += 2u; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the first two inputB samples using SIMD: + * y[0] and y[1] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLALD(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px + 1); + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLALD(x3, c0, acc3); + + /* Read y[2] and y[3] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLALD(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px + 2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px + 3); + + px += 4u; + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLALD(x1, c0, acc3); + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[4] */ + c0 = *py; +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x3, c0, acc2); + acc3 = __SMLALD(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py)++; + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x3, c0, acc2); + acc3 = __SMLALD(x2, c0, acc3); + + c0 = (*py); + + /* Read y[6] */ +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px + 2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT(acc0 >> 15, 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q15_t) (__SSAT(acc1 >> 15, 16)); + pOut += inc; + + *pOut = (q15_t) (__SSAT(acc2 >> 15, 16)); + pOut += inc; + + *pOut = (q15_t) (__SSAT(acc3 >> 15, 16)); + pOut += inc; + + /* Increment the count by 4 as 4 output values are computed */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q63_t) * px++ * *py++); + sum += ((q63_t) * px++ * *py++); + sum += ((q63_t) * px++ * *py++); + sum += ((q63_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q63_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT(sum >> 15, 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment count by 1, as one output value is computed */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q63_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT(sum >> 15, 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT((sum >> 15), 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* inputA pointer */ + q15_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ + q63_t sum; /* Accumulators */ + uint32_t i = 0u, j; /* loop counters */ + uint32_t inv = 0u; /* Reverse order flag */ + uint32_t tot = 0u; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using convolution but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2u); + + if(srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if(srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1u); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate convolution for output length number of times */ + for (i = 0u; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]); + } + } + /* Store the output in the destination buffer */ + if(inv == 1) + *pDst-- = (q15_t) __SSAT((sum >> 15u), 16u); + else + *pDst++ = (q15_t) __SSAT((sum >> 15u), 16u); + } + +#endif /*#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c new file mode 100644 index 000000000..c81f8600d --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c @@ -0,0 +1,664 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_q31.c +* +* Description: Correlation of Q31 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * See arm_correlate_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_correlate_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1; /* Intermediate pointers */ + q63_t sum, acc0, acc1, acc2; /* Accumulators */ + q31_t x0, x1, x2, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py++); + /* x[1] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py++); + /* x[2] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py++); + /* x[3] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll by 3 */ + blkCnt = blockSize2 / 3; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[0] sample */ + c0 = *(py); + + /* Read x[2] sample */ + x2 = *(px); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[0] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[1] * y[0] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[2] * y[0] */ + acc2 += ((q63_t) x2 * c0); + + /* Read y[1] sample */ + c0 = *(py + 1u); + + /* Read x[3] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[1] * y[1] */ + acc0 += ((q63_t) x1 * c0); + /* acc1 += x[2] * y[1] */ + acc1 += ((q63_t) x2 * c0); + /* acc2 += x[3] * y[1] */ + acc2 += ((q63_t) x0 * c0); + + /* Read y[2] sample */ + c0 = *(py + 2u); + + /* Read x[4] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[2] */ + acc0 += ((q63_t) x2 * c0); + /* acc1 += x[3] * y[2] */ + acc1 += ((q63_t) x0 * c0); + /* acc2 += x[4] * y[2] */ + acc2 += ((q63_t) x1 * c0); + + /* update scratch pointers */ + px += 3u; + py += 3u; + + } while(--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while(k > 0u) + { + /* Read y[4] sample */ + c0 = *(py++); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[5] * y[4] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[6] * y[4] */ + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (acc0 >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q31_t) (acc1 >> 31); + pOut += inc; + + *pOut = (q31_t) (acc2 >> 31); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py++); + sum += (q63_t) * px++ * (*py++); + sum += (q63_t) * px++ * (*py++); + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum += (q63_t) * px++ * (*py++); + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + sum += (q63_t) * px++ * (*py++); + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum += (q63_t) * px++ * (*py++); + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* inputA pointer */ + q31_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ + q63_t sum; /* Accumulators */ + uint32_t i = 0u, j; /* loop counters */ + uint32_t inv = 0u; /* Reverse order flag */ + uint32_t tot = 0u; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using correlation but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2u); + + if(srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if(srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1u); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate correlation for output length number of times */ + for (i = 0u; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to correlation equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]); + } + } + /* Store the output in the destination buffer */ + if(inv == 1) + *pDst-- = (q31_t) (sum >> 31u); + else + *pDst++ = (q31_t) (sum >> 31u); + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c new file mode 100644 index 000000000..e03e4997b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c @@ -0,0 +1,789 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_correlate_q7.c +* +* Description: Correlation of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.11 2011/10/18 +* Bug Fix in conv, correlation, partial convolution. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. + * + * \par + * Refer the function arm_correlate_opt_q7() for a faster implementation of this function. + * + */ + +void arm_correlate_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t input1, input2; /* temporary variables */ + q15_t in1, in2; /* temporary variables */ + q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2u * srcALen) - 1u; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1u)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2u); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1u); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 4] , y[srcBLen - 3] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[0] * y[srcBLen - 4] */ + /* x[1] * y[srcBLen - 3] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 2] , y[srcBLen - 1] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[2] * y[srcBLen - 2] */ + /* x[3] * y[srcBLen - 1] */ + sum = __SMLAD(input1, input2, sum); + + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum += (q31_t) ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[0] sample */ + c0 = *py++; + /* Read y[1] sample */ + c1 = *py++; + + /* Read x[3] sample */ + x3 = *px++; + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[0] and y[1] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[2] sample */ + c0 = *py++; + /* Read y[3] sample */ + c1 = *py++; + + /* Read x[5] sample */ + x1 = *px++; + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[2] and y[3] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *px++; + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[4] sample */ + c0 = *py++; + + /* Read x[7] sample */ + x3 = *px++; + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 += ((q15_t) x0 * c0); + /* acc1 += x[5] * y[4] */ + acc1 += ((q15_t) x1 * c0); + /* acc2 += x[6] * y[4] */ + acc2 += ((q15_t) x2 * c0); + /* acc3 += x[7] * y[4] */ + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(acc0 >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q7_t) (__SSAT(acc1 >> 7, 8)); + pOut += inc; + + *pOut = (q7_t) (__SSAT(acc2 >> 7, 8)); + pOut += inc; + + *pOut = (q7_t) (__SSAT(acc3 >> 7, 8)); + pOut += inc; + + count += 4u; + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[0] , y[1] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum = __SMLAD(input1, input2, sum); + + /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[2] , y[3] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* inputA pointer */ + q7_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ + q31_t sum; /* Accumulator */ + uint32_t i = 0u, j; /* loop counters */ + uint32_t inv = 0u; /* Reverse order flag */ + uint32_t tot = 0u; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using convolution but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2u); + + if(srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if(srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1u); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate convolution for output length number of times */ + for (i = 0u; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]); + } + } + /* Store the output in the destination buffer */ + if(inv == 1) + *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u); + else + *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u); + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of Corr group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c new file mode 100644 index 000000000..1546f407d --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c @@ -0,0 +1,518 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_f32.c +* +* Description: FIR decimation for floating-point sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator + * + * These functions combine an FIR filter together with a decimator. + * They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion. + * Conceptually, the functions are equivalent to the block diagram below: + * \image html FIRDecimator.gif "Components included in the FIR Decimator functions" + * When decimating by a factor of M, the signal should be prefiltered by a lowpass filter with a normalized + * cutoff frequency of 1/M in order to prevent aliasing distortion. + * The user of the function is responsible for providing the filter coefficients. + * + * The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner. + * Instead of calculating all of the FIR filter outputs and discarding M-1 out of every M, only the + * samples output by the decimator are computed. + * The functions operate on blocks of input and output data. + * pSrc points to an array of blockSize input values and + * pDst points to an array of blockSize/M output values. + * In order to have an integer number of output samples blockSize + * must always be a multiple of the decimation factor M. + * + * The library provides separate functions for Q15, Q31 and floating-point data types. + * + * \par Algorithm: + * The FIR portion of the algorithm uses the standard form filter: + *
    
+ *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]    
+ * 
+ * where, b[n] are the filter coefficients. + * \par + * The pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the order: + * \par + *
    
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}    
+ * 
+ * The state variables are updated after each block of data is processed, the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable array should be allocated separately. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * - Checks to make sure that the size of the input is a multiple of the decimation factor. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * The code below statically initializes each of the 3 different data type filter instance structures + *
    
+ *arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};    
+ *arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};    
+ *arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};    
+ * 
+ * where M is the decimation factor; numTaps is the number of filter coefficients in the filter; + * pCoeffs is the address of the coefficient buffer; + * pState is the address of the state buffer. + * Be sure to set the values in the state buffer to zeros when doing static initialization. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR decimate filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + + /** + * @brief Processing function for the floating-point FIR decimator. + * @param[in] *S points to an instance of the floating-point FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void arm_fir_decimate_f32( + const arm_fir_decimate_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t sum0; /* Accumulator */ + float32_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + +#ifndef ARM_MATH_CM0 + + uint32_t blkCntN4; + float32_t *px0, *px1, *px2, *px3; + float32_t acc0, acc1, acc2, acc3; + float32_t x1, x2, x3; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 4; + blkCntN4 = outBlockSize - (4 * blkCnt); + + while(blkCnt > 0u) + { + /* Copy 4 * decimation factor number of new input samples into the state buffer */ + i = 4 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* Initialize state pointer for all the samples */ + px0 = pState; + px1 = pState + S->M; + px2 = pState + 2 * S->M; + px3 = pState + 3 * S->M; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample for acc0 */ + x0 = *(px0++); + /* Read x[n-numTaps-1] sample for acc1 */ + x1 = *(px1++); + /* Read x[n-numTaps-1] sample for acc2 */ + x2 = *(px2++); + /* Read x[n-numTaps-1] sample for acc3 */ + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch state variables for acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + 4 * S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + + /* Decrement the loop counter */ + blkCnt--; + } + + while(blkCntN4 > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Decrement the loop counter */ + blkCntN4--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + i = (numTaps - 1u); + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c new file mode 100644 index 000000000..96f50e6f2 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c @@ -0,0 +1,590 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_fast_q15.c +* +* Description: Fast Q15 FIR Decimator. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. + * @param[in] *S points to an instance of the Q15 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + * @return none + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around and distorts the result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2). + * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. + * + * \par + * Refer to the function arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. + * Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_decimate_init_q15() to initialize the filter structure. + */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + +void arm_fir_decimate_fast_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ + q31_t sum0; /* Accumulators */ + q31_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLAD(x0, c0, acc0); + + acc1 = __SMLAD(x1, c0, acc1); + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLAD(x0, c0, acc0); + + acc1 = __SMLAD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + + + while(blkCntN3 > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px)++; + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c1 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLAD(x0, c0, sum0); + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLAD(x0, c1, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLAD(x0, c0, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +#else + + +void arm_fir_decimate_fast_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ + q31_t sum0; /* Accumulators */ + q31_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + + /* Decrement the loop counter */ + blkCnt--; + } + + while(blkCntN3 > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c new file mode 100644 index 000000000..a43fd0b4c --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c @@ -0,0 +1,343 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_fast_q31.c +* +* Description: Fast Q31 FIR Decimator. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. + * @param[in] *S points to an instance of the Q31 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + * @return none + * + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are added to a 2.30 accumulator. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2). + * + * \par + * Refer to the function arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. + * Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_decimate_init_q31() to initialize the filter structure. + */ + +void arm_fir_decimate_fast_q31( + arm_fir_decimate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + q31_t *px; /* Temporary pointers for state buffer */ + q31_t *pb; /* Temporary pointers for coefficient buffer */ + q31_t sum0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + uint32_t blkCntN2; + q31_t x1; + q31_t acc0, acc1; + q31_t *px0, *px1; + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + + blkCnt = outBlockSize / 2; + blkCntN2 = outBlockSize - (2 * blkCnt); + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + px1 = pState + S->M; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb); + + /* Read x[n-numTaps-1] for sample 0 sample 1 */ + x0 = *(px0); + x1 = *(px1); + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb + 1u); + + /* Read x[n-numTaps-2] for sample 0 sample 1 */ + x0 = *(px0 + 1u); + x1 = *(px1 + 1u); + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb + 2u); + + /* Read x[n-numTaps-3] for sample 0 sample 1 */ + x0 = *(px0 + 2u); + x1 = *(px1 + 2u); + pb += 4u; + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb - 1u); + + /* Read x[n-numTaps-4] for sample 0 sample 1 */ + x0 = *(px0 + 3u); + x1 = *(px1 + 3u); + + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* update state pointers */ + px0 += 4u; + px1 += 4u; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px0++); + x1 = *(px1++); + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (acc0 << 1); + *pDst++ = (q31_t) (acc1 << 1); + + /* Decrement the loop counter */ + blkCnt--; + } + + while(blkCntN2 > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 << 1); + + /* Decrement the loop counter */ + blkCntN2--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c new file mode 100644 index 000000000..b655a6be6 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c @@ -0,0 +1,112 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_init_f32.c +* +* Description: Floating-point FIR Decimator initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Initialization function for the floating-point FIR decimator. + * @param[in,out] *S points to an instance of the floating-point FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32(). + * M is the decimation factor. + */ + +arm_status arm_fir_decimate_init_f32( + arm_fir_decimate_instance_f32 * S, + uint16_t numTaps, + uint8_t M, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The size of the input block must be a multiple of the decimation factor */ + if((blockSize % M) != 0u) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation Factor */ + S->M = M; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c new file mode 100644 index 000000000..1a1fee8a2 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c @@ -0,0 +1,114 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_init_q15.c +* +* Description: Initialization function for the Q15 FIR Decimator. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Initialization function for the Q15 FIR decimator. + * @param[in,out] *S points to an instance of the Q15 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples + * to the call arm_fir_decimate_q15(). + * M is the decimation factor. + */ + +arm_status arm_fir_decimate_init_q15( + arm_fir_decimate_instance_q15 * S, + uint16_t numTaps, + uint8_t M, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + + arm_status status; + + /* The size of the input block must be a multiple of the decimation factor */ + if((blockSize % M) != 0u) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size of buffer is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation factor */ + S->M = M; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c new file mode 100644 index 000000000..e2ac8165e --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c @@ -0,0 +1,112 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_init_q31.c +* +* Description: Initialization function for Q31 FIR Decimation filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Initialization function for the Q31 FIR decimator. + * @param[in,out] *S points to an instance of the Q31 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_q31(). + * M is the decimation factor. + */ + +arm_status arm_fir_decimate_init_q31( + arm_fir_decimate_instance_q31 * S, + uint16_t numTaps, + uint8_t M, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The size of the input block must be a multiple of the decimation factor */ + if((blockSize % M) != 0u) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation factor */ + S->M = M; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q15.c new file mode 100644 index 000000000..28b5f13dd --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q15.c @@ -0,0 +1,691 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_q15.c +* +* Description: Q15 FIR Decimator. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR decimator. + * @param[in] *S points to an instance of the Q15 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * Refer to the function arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +#ifndef ARM_MATH_CM0 + +#ifndef UNALIGNED_SUPPORT_DISABLE + +void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ + q63_t sum0; /* Accumulators */ + q63_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLALD(x0, c0, acc0); + + acc1 = __SMLALD(x1, c0, acc1); + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLALD(x0, c0, acc0); + + acc1 = __SMLALD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + + + while(blkCntN3 > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px)++; + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c1 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLALD(x0, c0, sum0); + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLALD(x0, c1, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLALD(x0, c0, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +#else + + +void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ + q63_t sum0; /* Accumulators */ + q63_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + while(blkCntN3 > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#else + + +void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + q63_t sum0; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + +/* Run the below code for Cortex-M0 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += (q31_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /*Store filter output , smlad will return the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = numTaps - 1u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + +} +#endif /* #ifndef ARM_MATH_CM0 */ + + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q31.c new file mode 100644 index 000000000..59c0fed7f --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_q31.c @@ -0,0 +1,306 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_decimate_q31.c +* +* Description: Q31 FIR Decimator. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q31 FIR decimator. + * @param[in] *S points to an instance of the Q31 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + * @return none + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2). + * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format. + * + * \par + * Refer to the function arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_fir_decimate_q31( + const arm_fir_decimate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + q31_t *px; /* Temporary pointers for state buffer */ + q31_t *pb; /* Temporary pointers for coefficient buffer */ + q63_t sum0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 >> 31); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while(blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while(--i); + + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 >> 31); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = numTaps - 1u; + + /* copy data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_f32.c new file mode 100644 index 000000000..7f951f86b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_f32.c @@ -0,0 +1,554 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_f32.c +* +* Description: Floating-point FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR Finite Impulse Response (FIR) Filters + * + * This set of functions implements Finite Impulse Response (FIR) filters + * for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided. + * The functions operate on blocks of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst points to input and output arrays containing blockSize values. + * + * \par Algorithm: + * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations. + * Each filter coefficient b[n] is multiplied by a state variable which equals a previous input sample x[n]. + *
  
+ *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]  
+ * 
+ * \par + * \image html FIR.gif "Finite Impulse Response filter" + * \par + * pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
  
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}  
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the following order. + * \par + *
  
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}  
+ * 
+ * \par + * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1. + * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters, + * to be avoided and yields a significant speed improvement. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 4 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 4 different data type filter instance structures + *
  
+ *arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};  
+ *arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};  
+ *arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};  
+ *arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};  
+ * 
+ * + * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * + * @param[in] *S points to an instance of the floating-point FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + */ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_f32( + const arm_fir_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */ + float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 3; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + acc4 = 0.0f; + acc5 = 0.0f; + acc6 = 0.0f; + acc7 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + x3 = *px++; + x4 = *px++; + x5 = *px++; + x6 = *px++; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 3u; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x7 = *(px++); + + /* acc0 += b[numTaps-1] * x[n-numTaps] */ + acc0 += x0 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ + acc1 += x1 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ + acc2 += x2 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ + acc3 += x3 * c0; + + /* acc4 += b[numTaps-1] * x[n-numTaps-4] */ + acc4 += x4 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-5] */ + acc5 += x5 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-6] */ + acc6 += x6 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-7] */ + acc7 += x7 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + acc0 += x1 * c0; + acc1 += x2 * c0; + acc2 += x3 * c0; + acc3 += x4 * c0; + acc4 += x5 * c0; + acc5 += x6 * c0; + acc6 += x7 * c0; + acc7 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x2 * c0; + acc1 += x3 * c0; + acc2 += x4 * c0; + acc3 += x5 * c0; + acc4 += x6 * c0; + acc5 += x7 * c0; + acc6 += x0 * c0; + acc7 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x3 * c0; + acc1 += x4 * c0; + acc2 += x5 * c0; + acc3 += x6 * c0; + acc4 += x7 * c0; + acc5 += x0 * c0; + acc6 += x1 * c0; + acc7 += x2 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x4 * c0; + acc1 += x5 * c0; + acc2 += x6 * c0; + acc3 += x7 * c0; + acc4 += x0 * c0; + acc5 += x1 * c0; + acc6 += x2 * c0; + acc7 += x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x4 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x5 * c0; + acc1 += x6 * c0; + acc2 += x7 * c0; + acc3 += x0 * c0; + acc4 += x1 * c0; + acc5 += x2 * c0; + acc6 += x3 * c0; + acc7 += x4 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x5 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x6 * c0; + acc1 += x7 * c0; + acc2 += x0 * c0; + acc3 += x1 * c0; + acc4 += x2 * c0; + acc5 += x3 * c0; + acc6 += x4 * c0; + acc7 += x5 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x6 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x7 * c0; + acc1 += x0 * c0; + acc2 += x1 * c0; + acc3 += x2 * c0; + acc4 += x3 * c0; + acc5 += x4 * c0; + acc6 += x5 * c0; + acc7 += x6 * c0; + + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x8u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x7 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + acc4 += x4 * c0; + acc5 += x5 * c0; + acc6 += x6 * c0; + acc7 += x7 * c0; + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 8; + + /* The results in the 4 accumulators, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + *pDst++ = acc4; + *pDst++ = acc5; + *pDst++ = acc6; + *pDst++ = acc7; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x8u; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += *px++ * *pb++; + i--; + + } while(i > 0u); + + /* The result is store in the destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else + +void arm_fir_f32( + const arm_fir_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* Run the below code for Cortex-M0 */ + + float32_t acc; + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += *px++ * *pb++; + i--; + + } while(i > 0u); + + /* The result is store in the destination buffer. */ + *pDst++ = acc; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the starting of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = numTaps - 1u; + + /* Copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c new file mode 100644 index 000000000..59abff2f5 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c @@ -0,0 +1,341 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_fast_q15.c +* +* Description: Q15 Fast FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.9 2010/08/16 +* Initial version +* +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q15 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around and distorts the result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. + * + * \par + * Refer to the function arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_init_q15() to initialize the filter structure. + */ + +void arm_fir_fast_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ + px = pState; + + /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = *__SIMD32(px)++; + + /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ + x2 = *__SIMD32(px)++; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLAD(x0, c0, acc0); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLAD(x2, c0, acc2); + + /* pack x[n-N-1] and x[n-N-2] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* pack x[n-N-3] and x[n-N-4] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLADX(x1, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLAD(x2, c0, acc0); + + /* Read state x[n-N-6], x[n-N-7] with offset */ + x2 = _SIMD32_OFFSET(px + 2u); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLAD(x0, c0, acc2); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* pack x[n-N-5] and x[n-N-6] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLADX(x1, c0, acc3); + + /* Update state pointer for next state reading */ + px += 4u; + + /* Decrement tap count */ + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if((numTaps & 0x3u) != 0u) + { + + /* Read last two coefficients */ + c0 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc2 = __SMLAD(x2, c0, acc2); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read last state variables */ + x0 = *__SIMD32(px); + + /* Perform the multiply-accumulates */ + acc1 = __SMLADX(x1, c0, acc1); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* Perform the multiply-accumulates */ + acc3 = __SMLADX(x1, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4u; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + while(blkCnt > 0u) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Use SIMD to hold states and coefficients */ + px = pState; + pb = pCoeffs; + + tapCnt = numTaps >> 1u; + + do + { + + acc0 += (q31_t) * px++ * *pb++; + acc0 += (q31_t) * px++ * *pb++; + + tapCnt--; + } + while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1u; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy remaining data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c new file mode 100644 index 000000000..e384ff973 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c @@ -0,0 +1,309 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_fast_q31.c +* +* Description: Processing function for the Q31 Fast FIR filter. +* +* Target Processor: Cortex-M4/Cortex-M3 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.9 2010/08/27 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q31 structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are added to a 2.30 accumulator. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * + * \par + * Refer to the function arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_init_q31() to initialize the filter structure. + */ + +void arm_fir_fast_q31( + const arm_fir_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t x0, x1, x2, x3; /* Temporary variables to hold state */ + q31_t c0; /* Temporary variable to hold coefficient value */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first three samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + i = tapCnt; + + while(i > 0u) + { + /* Read the b[numTaps] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x3 = *(px++); + + /* acc0 += b[numTaps] * x[n-numTaps] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* acc1 += b[numTaps] * x[n-numTaps-1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* acc2 += b[numTaps] * x[n-numTaps-2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + + /* acc3 += b[numTaps] * x[n-numTaps-3] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-3] coefficients */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + i--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + i = numTaps - (tapCnt * 4u); + while(i > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31 + ** Then store the 4 outputs in the destination buffer. */ + *pDst++ = (q31_t) (acc0 << 1); + *pDst++ = (q31_t) (acc1 << 1); + *pDst++ = (q31_t) (acc2 << 1); + *pDst++ = (q31_t) (acc3 << 1); + + /* Decrement the samples loop counter */ + blkCnt--; + } + + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 4u; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 = + (q31_t) ((((q63_t) acc0 << 32) + + ((q63_t) (*px++) * (*(pb++)))) >> 32); + i--; + } while(i > 0u); + + /* The result is in 2.30 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc0 << 1); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c new file mode 100644 index 000000000..aaeddf5f6 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c @@ -0,0 +1,94 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_f32.c +* +* Description: Floating-point FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32(). + */ + +void arm_fir_init_f32( + arm_fir_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c new file mode 100644 index 000000000..02bb33623 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c @@ -0,0 +1,152 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_q15.c +* +* Description: Q15 FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* ------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in,out] *S points to an instance of the Q15 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize is number of samples processed per call. + * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if + * numTaps is not greater than or equal to 4 and even. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * Note that numTaps must be even and greater than or equal to 4. + * To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero. + * For example, to implement a filter with numTaps=3 and coefficients + *
    
+ *     {0.3, -0.8, 0.3}    
+ * 
+ * set numTaps=4 and use the coefficients: + *
    
+ *     {0.3, -0.8, 0.3, 0}.    
+ * 
+ * Similarly, to implement a two point filter + *
    
+ *     {0.3, -0.3}    
+ * 
+ * set numTaps=4 and use the coefficients: + *
    
+ *     {0.3, -0.3, 0, 0}.    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15(). + */ + +arm_status arm_fir_init_q15( + arm_fir_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + arm_status status; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* The Number of filter coefficients in the filter must be even and at least 4 */ + if(numTaps & 0x1u) + { + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps ) */ + memset(pState, 0, (numTaps + (blockSize)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + + return (status); + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c new file mode 100644 index 000000000..997a78427 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c @@ -0,0 +1,94 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_q31.c +* +* Description: Q31 FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q31 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q31(). + */ + +void arm_fir_init_q31( + arm_fir_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and state array size is (blockSize + numTaps - 1) */ + memset(pState, 0, (blockSize + ((uint32_t) numTaps - 1u)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q7.c new file mode 100644 index 000000000..65a49a823 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q7.c @@ -0,0 +1,92 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_q7.c +* +* Description: Q7 FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* ------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ +/** + * @param[in,out] *S points to an instance of the Q7 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7(). + */ + +void arm_fir_init_q7( + arm_fir_instance_q7 * S, + uint16_t numTaps, + q7_t * pCoeffs, + q7_t * pState, + uint32_t blockSize) +{ + + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q7_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c new file mode 100644 index 000000000..c966a0ced --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c @@ -0,0 +1,574 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_interpolate_f32.c +* +* Description: FIR interpolation for floating-point sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator + * + * These functions combine an upsampler (zero stuffer) and an FIR filter. + * They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images. + * Conceptually, the functions are equivalent to the block diagram below: + * \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions" + * After upsampling by a factor of L, the signal should be filtered by a lowpass filter with a normalized + * cutoff frequency of 1/L in order to eliminate high frequency copies of the spectrum. + * The user of the function is responsible for providing the filter coefficients. + * + * The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner. + * The upsampler inserts L-1 zeros between each sample. + * Instead of multiplying by these zero values, the FIR filter is designed to skip them. + * This leads to an efficient implementation without any wasted effort. + * The functions operate on blocks of input and output data. + * pSrc points to an array of blockSize input values and + * pDst points to an array of blockSize*L output values. + * + * The library provides separate functions for Q15, Q31, and floating-point data types. + * + * \par Algorithm: + * The functions use a polyphase filter structure: + *
    
+ *    y[n] = b[0] * x[n] + b[L]   * x[n-1] + ... + b[L*(phaseLength-1)] * x[n-phaseLength+1]    
+ *    y[n+1] = b[1] * x[n] + b[L+1] * x[n-1] + ... + b[L*(phaseLength-1)+1] * x[n-phaseLength+1]    
+ *    ...    
+ *    y[n+(L-1)] = b[L-1] * x[n] + b[2*L-1] * x[n-1] + ....+ b[L*(phaseLength-1)+(L-1)] * x[n-phaseLength+1]    
+ * 
+ * This approach is more efficient than straightforward upsample-then-filter algorithms. + * With this method the computation is reduced by a factor of 1/L when compared to using a standard FIR filter. + * \par + * pCoeffs points to a coefficient array of size numTaps. + * numTaps must be a multiple of the interpolation factor L and this is checked by the + * initialization functions. + * Internally, the function divides the FIR filter's impulse response into shorter filters of length + * phaseLength=numTaps/L. + * Coefficients are stored in time reversed order. + * \par + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to a state array of size blockSize + phaseLength - 1. + * Samples in the state buffer are stored in the order: + * \par + *
    
+ *    {x[n-phaseLength+1], x[n-phaseLength], x[n-phaseLength-1], x[n-phaseLength-2]....x[0], x[1], ..., x[blockSize-1]}    
+ * 
+ * The state variables are updated after each block of data is processed, the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable array should be allocated separately. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * - Checks to make sure that the length of the filter is a multiple of the interpolation factor. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * The code below statically initializes each of the 3 different data type filter instance structures + *
    
+ * arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};    
+ * arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};    
+ * arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};    
+ * 
+ * where L is the interpolation factor; phaseLength=numTaps/L is the + * length of each of the shorter FIR filters used internally, + * pCoeffs is the address of the coefficient buffer; + * pState is the address of the state buffer. + * Be sure to set the values in the state buffer to zeros when doing static initialization. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR interpolate filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Processing function for the floating-point FIR interpolator. + * @param[in] *S points to an instance of the floating-point FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_interpolate_f32( + const arm_fir_interpolate_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + float32_t sum0; /* Accumulators */ + float32_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, j; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + float32_t acc0, acc1, acc2, acc3; + float32_t x1, x2, x3; + uint32_t blkCntN4; + float32_t c1, c2, c3; + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1u); + + /* Initialise blkCnt */ + blkCnt = blockSize / 4; + blkCntN4 = blockSize - (4 * blkCnt); + + /* Samples loop unrolled by 4 */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = (S->L); + + while(i > 0u) + { + /* Set accumulator to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2u; + + x0 = *(ptr1++); + x1 = *(ptr1++); + x2 = *(ptr1++); + + while(tapCnt > 0u) + { + + /* Read the input sample */ + x3 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the coefficient */ + c1 = *(ptr2 + S->L); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += x1 * c1; + acc1 += x2 * c1; + acc2 += x3 * c1; + acc3 += x0 * c1; + + /* Read the coefficient */ + c2 = *(ptr2 + S->L * 2); + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += x2 * c2; + acc1 += x3 * c2; + acc2 += x0 * c2; + acc3 += x1 * c2; + + /* Read the coefficient */ + c3 = *(ptr2 + S->L * 3); + + /* Read the input sample */ + x2 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += x3 * c3; + acc1 += x0 * c3; + acc2 += x1 * c3; + acc3 += x2 * c3; + + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += 4 * S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4u; + + while(tapCnt > 0u) + { + + /* Read the input sample */ + x3 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* update states for next sample processing */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst = acc0; + *(pDst + S->L) = acc1; + *(pDst + 2 * S->L) = acc2; + *(pDst + 3 * S->L) = acc3; + + pDst++; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 4; + + pDst += S->L * 3; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + + while(blkCntN4 > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = S->L; + while(i > 0u) + { + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2u; + while(tapCnt > 0u) + { + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum0 += *(ptr1++) * (*ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCntN4--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (phaseLen - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (phaseLen - 1u) % 0x04u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else + + /* Run the below code for Cortex-M0 */ + +void arm_fir_interpolate_f32( + const arm_fir_interpolate_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + + + float32_t sum; /* Accumulator */ + uint32_t i, blkCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1u); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Loop over the Interpolation factor. */ + i = S->L; + + while(i > 0u) + { + /* Set accumulator to zero */ + sum = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (i - 1u); + + /* Loop over the polyPhase length */ + tapCnt = phaseLen; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += *ptr1++ * *ptr2; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = phaseLen - 1u; + + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + + + + /** + * @} end of FIR_Interpolate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c new file mode 100644 index 000000000..b19347264 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c @@ -0,0 +1,116 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_interpolate_init_f32.c +* +* Description: Floating-point FIR interpolator initialization function +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Initialization function for the floating-point FIR interpolator. + * @param[in,out] *S points to an instance of the floating-point FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}    
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32(). + */ + +arm_status arm_fir_interpolate_init_f32( + arm_fir_interpolate_instance_f32 * S, + uint8_t L, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The filter length must be a multiple of the interpolation factor */ + if((numTaps % L) != 0u) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */ + memset(pState, 0, + (blockSize + + ((uint32_t) S->phaseLength - 1u)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + + /** + * @} end of FIR_Interpolate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c new file mode 100644 index 000000000..b344fc785 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c @@ -0,0 +1,115 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_interpolate_init_q15.c +* +* Description: Q15 FIR interpolator initialization function +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Initialization function for the Q15 FIR interpolator. + * @param[in,out] *S points to an instance of the Q15 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}    
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q15(). + */ + +arm_status arm_fir_interpolate_init_q15( + arm_fir_interpolate_instance_q15 * S, + uint8_t L, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The filter length must be a multiple of the interpolation factor */ + if((numTaps % L) != 0u) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */ + memset(pState, 0, + (blockSize + ((uint32_t) S->phaseLength - 1u)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + + /** + * @} end of FIR_Interpolate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c new file mode 100644 index 000000000..f0383b8e9 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c @@ -0,0 +1,116 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_interpolate_init_q31.c +* +* Description: Q31 FIR interpolator initialization function +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + + +/** + * @brief Initialization function for the Q31 FIR interpolator. + * @param[in,out] *S points to an instance of the Q31 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}    
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q31(). + */ + +arm_status arm_fir_interpolate_init_q31( + arm_fir_interpolate_instance_q31 * S, + uint8_t L, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The filter length must be a multiple of the interpolation factor */ + if((numTaps % L) != 0u) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */ + memset(pState, 0, + (blockSize + ((uint32_t) S->phaseLength - 1u)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + + /** + * @} end of FIR_Interpolate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c new file mode 100644 index 000000000..d17d8d803 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c @@ -0,0 +1,503 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_interpolate_q15.c +* +* Description: Q15 FIR interpolation. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR interpolator. + * @param[in] *S points to an instance of the Q15 FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + */ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_interpolate_q15( + const arm_fir_interpolate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + q63_t sum0; /* Accumulators */ + q15_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, j, tapCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */ + uint32_t blkCntN2; + q63_t acc0, acc1; + q15_t x1; + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + ((q31_t) phaseLen - 1); + + /* Initialise blkCnt */ + blkCnt = blockSize / 2; + blkCntN2 = blockSize - (2 * blkCnt); + + /* Samples loop unrolled by 2 */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = (S->L); + + while(i > 0u) + { + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2u; + + x0 = *(ptr1++); + + while(tapCnt > 0u) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 2); + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 3); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += 4 * S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4u; + + while(tapCnt > 0u) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* update states for next sample processing */ + x0 = x1; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst = (q15_t) (__SSAT((acc0 >> 15), 16)); + *(pDst + S->L) = (q15_t) (__SSAT((acc1 >> 15), 16)); + + pDst++; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 2; + + pDst += S->L; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blkCntN2; + + /* Loop over the blockSize. */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = S->L; + while(i > 0u) + { + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2; + while(tapCnt > 0u) + { + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen & 0x3u; + + while(tapCnt > 0u) + { + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = ((uint32_t) phaseLen - 1u) >> 2u; + + /* copy data */ + while(i > 0u) + { +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + +#else + + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + i--; + } + + i = ((uint32_t) phaseLen - 1u) % 0x04u; + + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +#else + + /* Run the below code for Cortex-M0 */ + +void arm_fir_interpolate_q15( + const arm_fir_interpolate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + q63_t sum; /* Accumulator */ + q15_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, tapCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1u); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Loop over the Interpolation factor. */ + i = S->L; + + while(i > 0u) + { + /* Set accumulator to zero */ + sum = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (i - 1u); + + /* Loop over the polyPhase length */ + tapCnt = (uint32_t) phaseLen; + + while(tapCnt > 0u) + { + /* Read the coefficient */ + c0 = *ptr2; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *ptr1++; + + /* Perform the multiply-accumulate */ + sum += ((q31_t) x0 * c0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Store the result after converting to 1.15 format in the destination buffer */ + *pDst++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (uint32_t) phaseLen - 1u; + + while(i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + + + /** + * @} end of FIR_Interpolate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c new file mode 100644 index 000000000..689b4d337 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c @@ -0,0 +1,499 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_interpolate_q31.c +* +* Description: Q31 FIR interpolation. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Processing function for the Q31 FIR interpolator. + * @param[in] *S points to an instance of the Q31 FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 1/(numTaps/L). + * since numTaps/L additions occur per output sample. + * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format. + */ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_interpolate_q31( + const arm_fir_interpolate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + q63_t sum0; /* Accumulators */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, j; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + uint32_t blkCntN2; + q63_t acc0, acc1; + q31_t x1; + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + ((q31_t) phaseLen - 1); + + /* Initialise blkCnt */ + blkCnt = blockSize / 2; + blkCntN2 = blockSize - (2 * blkCnt); + + /* Samples loop unrolled by 2 */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = (S->L); + + while(i > 0u) + { + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2u; + + x0 = *(ptr1++); + + while(tapCnt > 0u) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 2); + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 3); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += 4 * S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4u; + + while(tapCnt > 0u) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* update states for next sample processing */ + x0 = x1; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst = (q31_t) (acc0 >> 31); + *(pDst + S->L) = (q31_t) (acc1 >> 31); + + + pDst++; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 2; + + pDst += S->L; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blkCntN2; + + /* Loop over the blockSize. */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = S->L; + while(i > 0u) + { + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2; + while(tapCnt > 0u) + { + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen & 0x3u; + + while(tapCnt > 0u) + { + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 >> 31); + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (phaseLen - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (phaseLen - 1u) % 0x04u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + + +#else + +void arm_fir_interpolate_q31( + const arm_fir_interpolate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + + /* Run the below code for Cortex-M0 */ + + q63_t sum; /* Accumulator */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + ((q31_t) phaseLen - 1); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while(blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Loop over the Interpolation factor. */ + i = S->L; + + while(i > 0u) + { + /* Set accumulator to zero */ + sum = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (i - 1u); + + tapCnt = phaseLen; + + while(tapCnt > 0u) + { + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *ptr1++; + + /* Perform the multiply-accumulate */ + sum += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum >> 31); + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = phaseLen - 1u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + + /** + * @} end of FIR_Interpolate group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c new file mode 100644 index 000000000..3745ffb6a --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c @@ -0,0 +1,499 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_lattice_f32.c +* +* Description: Processing function for the floating-point FIR Lattice filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters + * + * This set of functions implements Finite Impulse Response (FIR) lattice filters + * for Q15, Q31 and floating-point data types. Lattice filters are used in a + * variety of adaptive filter applications. The filter structure is feedforward and + * the net impulse response is finite length. + * The functions operate on blocks + * of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst point to input and output arrays containing blockSize values. + * + * \par Algorithm: + * \image html FIRLattice.gif "Finite Impulse Response Lattice filter" + * The following difference equation is implemented: + *
    
+ *    f0[n] = g0[n] = x[n]    
+ *    fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M    
+ *    gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M    
+ *    y[n] = fM[n]    
+ * 
+ * \par + * pCoeffs points to tha array of reflection coefficients of size numStages. + * Reflection Coefficients are stored in the following order. + * \par + *
    
+ *    {k1, k2, ..., kM}    
+ * 
+ * where M is number of stages + * \par + * pState points to a state array of size numStages. + * The state variables (g values) hold previous inputs and are stored in the following order. + *
    
+ *    {g0[n], g1[n], g2[n] ...gM-1[n]}    
+ * 
+ * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows: + *
    
+ *arm_fir_lattice_instance_f32 S = {numStages, pState, pCoeffs};    
+ *arm_fir_lattice_instance_q31 S = {numStages, pState, pCoeffs};    
+ *arm_fir_lattice_instance_q15 S = {numStages, pState, pCoeffs};    
+ * 
+ * \par + * where numStages is the number of stages in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer. + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR Lattice filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + + /** + * @brief Processing function for the floating-point FIR lattice filter. + * @param[in] *S points to an instance of the floating-point FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_fir_lattice_f32( + const arm_fir_lattice_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *px; /* temporary state pointer */ + float32_t *pk; /* temporary coefficient pointer */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */ + float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */ + float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */ + uint32_t numStages = S->numStages; /* Number of stages in the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + gcurr1 = 0.0f; + pState = &S->pState[0]; + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + + /* Read two samples from input buffer */ + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + fcurr2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* Read g0(n-1) from state */ + gcurr1 = *px; + + /* Process first sample for first tap */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (fcurr1 * (*pk)) + gcurr1; + + /* Process second sample for first tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ((*pk) * fcurr1); + gnext2 = (fcurr2 * (*pk)) + fcurr1; + + /* Read next two samples from input buffer */ + /* f0(n+2) = x(n+2) */ + fcurr3 = *pSrc++; + fcurr4 = *pSrc++; + + /* Copy only last input samples into the state buffer + which will be used for next four samples processing */ + *px++ = fcurr4; + + /* Process third sample for first tap */ + fnext3 = fcurr3 + ((*pk) * fcurr2); + gnext3 = (fcurr3 * (*pk)) + fcurr2; + + /* Process fourth sample for first tap */ + fnext4 = fcurr4 + ((*pk) * fcurr3); + gnext4 = (fcurr4 * (*pk++)) + fcurr3; + + /* Update of f values for next coefficient set processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + fcurr3 = fnext3; + fcurr4 = fnext4; + + /* Loop unrolling. Process 4 taps at a time . */ + stageCnt = (numStages - 1u) >> 2u; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numStages-3 coefficients. */ + + /* Process 2nd, 3rd, 4th and 5th taps ... here */ + while(stageCnt > 0u) + { + /* Read g1(n-1), g3(n-1) .... from state */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext4; + + /* Process first sample for 2nd, 6th .. tap */ + /* Sample processing for K2, K6.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* Process second sample for 2nd, 6th .. tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ((*pk) * gnext1); + /* Process third sample for 2nd, 6th .. tap */ + fnext3 = fcurr3 + ((*pk) * gnext2); + /* Process fourth sample for 2nd, 6th .. tap */ + fnext4 = fcurr4 + ((*pk) * gnext3); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurr1 = *px; + + /* save g2(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K3, K7.... */ + /* Process first sample for 3rd, 7th .. tap */ + /* f3(n) = f2(n) + K3 * g2(n-1) */ + fcurr1 = fnext1 + ((*pk) * gcurr1); + /* Process second sample for 3rd, 7th .. tap */ + fcurr2 = fnext2 + ((*pk) * gnext1); + /* Process third sample for 3rd, 7th .. tap */ + fcurr3 = fnext3 + ((*pk) * gnext2); + /* Process fourth sample for 3rd, 7th .. tap */ + fcurr4 = fnext4 + ((*pk) * gnext3); + + /* Calculation of state values for next stage */ + /* g3(n) = f2(n) * K3 + g2(n-1) */ + gnext4 = (fnext4 * (*pk)) + gnext3; + gnext3 = (fnext3 * (*pk)) + gnext2; + gnext2 = (fnext2 * (*pk)) + gnext1; + gnext1 = (fnext1 * (*pk++)) + gcurr1; + + + /* Read g1(n-1), g3(n-1) .... from state */ + gcurr1 = *px; + + /* save g3(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K4, K8.... */ + /* Process first sample for 4th, 8th .. tap */ + /* f4(n) = f3(n) + K4 * g3(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* Process second sample for 4th, 8th .. tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ((*pk) * gnext1); + /* Process third sample for 4th, 8th .. tap */ + fnext3 = fcurr3 + ((*pk) * gnext2); + /* Process fourth sample for 4th, 8th .. tap */ + fnext4 = fcurr4 + ((*pk) * gnext3); + + /* g4(n) = f3(n) * K4 + g3(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurr1 = *px; + + /* save g4(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K5, K9.... */ + /* Process first sample for 5th, 9th .. tap */ + /* f5(n) = f4(n) + K5 * g4(n-1) */ + fcurr1 = fnext1 + ((*pk) * gcurr1); + /* Process second sample for 5th, 9th .. tap */ + fcurr2 = fnext2 + ((*pk) * gnext1); + /* Process third sample for 5th, 9th .. tap */ + fcurr3 = fnext3 + ((*pk) * gnext2); + /* Process fourth sample for 5th, 9th .. tap */ + fcurr4 = fnext4 + ((*pk) * gnext3); + + /* Calculation of state values for next stage */ + /* g5(n) = f4(n) * K5 + g4(n-1) */ + gnext4 = (fnext4 * (*pk)) + gnext3; + gnext3 = (fnext3 * (*pk)) + gnext2; + gnext2 = (fnext2 * (*pk)) + gnext1; + gnext1 = (fnext1 * (*pk++)) + gcurr1; + + stageCnt--; + } + + /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */ + stageCnt = (numStages - 1u) % 0x4u; + + while(stageCnt > 0u) + { + gcurr1 = *px; + + /* save g value in state buffer */ + *px++ = gnext4; + + /* Process four samples for last three taps here */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + fnext2 = fcurr2 + ((*pk) * gnext1); + fnext3 = fcurr3 + ((*pk) * gnext2); + fnext4 = fcurr4 + ((*pk) * gnext3); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* Update of f values for next coefficient set processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + fcurr3 = fnext3; + fcurr4 = fnext4; + + stageCnt--; + + } + + /* The results in the 4 accumulators, store in the destination buffer. */ + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + *pDst++ = fcurr2; + *pDst++ = fcurr3; + *pDst++ = fcurr4; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* save g1(n) in state buffer */ + *px++ = fcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + + blkCnt--; + + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t fcurr, fnext, gcurr, gnext; /* temporary variables */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize coeff pointer */ + pk = pCoeffs; + + /* Initialize state pointer */ + px = pState; + + /* read g0(n-1) from state buffer */ + gcurr = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext = fcurr + ((*pk) * gcurr); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext = (fcurr * (*pk++)) + gcurr; + + /* save f0(n) in state buffer */ + *px++ = fcurr; + + /* f1(n) is saved in fcurr + for next stage processing */ + fcurr = fnext; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + /* read g2(n) from state buffer */ + gcurr = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext = fcurr + ((*pk) * gcurr); + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext = (fcurr * (*pk++)) + gcurr; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr = fnext; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr; + + blkCnt--; + + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c new file mode 100644 index 000000000..a53ae92e2 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c @@ -0,0 +1,78 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_lattice_init_f32.c +* +* Description: Floating-point FIR Lattice filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + +/** + * @brief Initialization function for the floating-point FIR lattice filter. + * @param[in] *S points to an instance of the floating-point FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +void arm_fir_lattice_init_f32( + arm_fir_lattice_instance_f32 * S, + uint16_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset(pState, 0, (numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c new file mode 100644 index 000000000..8db498509 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c @@ -0,0 +1,78 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_lattice_init_q15.c +* +* Description: Q15 FIR Lattice filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q15 FIR lattice filter. + * @param[in] *S points to an instance of the Q15 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +void arm_fir_lattice_init_q15( + arm_fir_lattice_instance_q15 * S, + uint16_t numStages, + q15_t * pCoeffs, + q15_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset(pState, 0, (numStages) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c new file mode 100644 index 000000000..035c5e056 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c @@ -0,0 +1,78 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_lattice_init_q31.c +* +* Description: Q31 FIR lattice filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q31 FIR lattice filter. + * @param[in] *S points to an instance of the Q31 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +void arm_fir_lattice_init_q31( + arm_fir_lattice_instance_q31 * S, + uint16_t numStages, + q31_t * pCoeffs, + q31_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset(pState, 0, (numStages) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c new file mode 100644 index 000000000..abfb33cf2 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c @@ -0,0 +1,531 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_lattice_q15.c +* +* Description: Q15 FIR lattice filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + +/** + * @brief Processing function for the Q15 FIR lattice filter. + * @param[in] *S points to an instance of the Q15 FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_fir_lattice_q15( + const arm_fir_lattice_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *px; /* temporary state pointer */ + q15_t *pk; /* temporary coefficient pointer */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t fcurnt1, fnext1, gcurnt1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */ + q31_t fcurnt2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + q31_t fcurnt3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */ + q31_t fcurnt4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */ + uint32_t numStages = S->numStages; /* Number of stages in the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + + /* Read two samples from input buffer */ + /* f0(n) = x(n) */ + fcurnt1 = *pSrc++; + fcurnt2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* Read g0(n-1) from state */ + gcurnt1 = *px; + + /* Process first sample for first tap */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (q31_t) ((fcurnt1 * (*pk)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* Process second sample for first tap */ + /* for sample 2 processing */ + fnext2 = (q31_t) ((fcurnt1 * (*pk)) >> 15u) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + fcurnt1; + gnext2 = __SSAT(gnext2, 16); + + + /* Read next two samples from input buffer */ + /* f0(n+2) = x(n+2) */ + fcurnt3 = *pSrc++; + fcurnt4 = *pSrc++; + + /* Copy only last input samples into the state buffer + which is used for next four samples processing */ + *px++ = (q15_t) fcurnt4; + + /* Process third sample for first tap */ + fnext3 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + fcurnt2; + gnext3 = __SSAT(gnext3, 16); + + /* Process fourth sample for first tap */ + fnext4 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + gnext4 = (q31_t) ((fcurnt4 * (*pk++)) >> 15u) + fcurnt3; + gnext4 = __SSAT(gnext4, 16); + + /* Update of f values for next coefficient set processing */ + fcurnt1 = fnext1; + fcurnt2 = fnext2; + fcurnt3 = fnext3; + fcurnt4 = fnext4; + + + /* Loop unrolling. Process 4 taps at a time . */ + stageCnt = (numStages - 1u) >> 2; + + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numStages-3 coefficients. */ + + /* Process 2nd, 3rd, 4th and 5th taps ... here */ + while(stageCnt > 0u) + { + /* Read g1(n-1), g3(n-1) .... from state */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Process first sample for 2nd, 6th .. tap */ + /* Sample processing for K2, K6.... */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + + /* Process second sample for 2nd, 6th .. tap */ + /* for sample 2 processing */ + fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + /* Process third sample for 2nd, 6th .. tap */ + fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + /* Process fourth sample for 2nd, 6th .. tap */ + /* fnext4 = fcurnt4 + (*pk) * gnext3; */ + fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15u) + gnext3; + gnext4 = __SSAT(gnext4, 16); + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + gnext2; + gnext3 = __SSAT(gnext3, 16); + + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + gnext1; + gnext2 = __SSAT(gnext2, 16); + + gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Sample processing for K3, K7.... */ + /* Process first sample for 3rd, 7th .. tap */ + /* f3(n) = f2(n) + K3 * g2(n-1) */ + fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fnext1; + fcurnt1 = __SSAT(fcurnt1, 16); + + /* Process second sample for 3rd, 7th .. tap */ + fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fnext2; + fcurnt2 = __SSAT(fcurnt2, 16); + + /* Process third sample for 3rd, 7th .. tap */ + fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fnext3; + fcurnt3 = __SSAT(fcurnt3, 16); + + /* Process fourth sample for 3rd, 7th .. tap */ + fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fnext4; + fcurnt4 = __SSAT(fcurnt4, 16); + + /* Calculation of state values for next stage */ + /* g3(n) = f2(n) * K3 + g2(n-1) */ + gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15u) + gnext3; + gnext4 = __SSAT(gnext4, 16); + + gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15u) + gnext2; + gnext3 = __SSAT(gnext3, 16); + + gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15u) + gnext1; + gnext2 = __SSAT(gnext2, 16); + + gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* Read g1(n-1), g3(n-1) .... from state */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Sample processing for K4, K8.... */ + /* Process first sample for 4th, 8th .. tap */ + /* f4(n) = f3(n) + K4 * g3(n-1) */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + /* Process second sample for 4th, 8th .. tap */ + /* for sample 2 processing */ + fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + + /* Process third sample for 4th, 8th .. tap */ + fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + + /* Process fourth sample for 4th, 8th .. tap */ + fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + + /* g4(n) = f3(n) * K4 + g3(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15u) + gnext3; + gnext4 = __SSAT(gnext4, 16); + + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + gnext2; + gnext3 = __SSAT(gnext3, 16); + + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + gnext1; + gnext2 = __SSAT(gnext2, 16); + gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurnt1 = *px; + + /* save g4(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Sample processing for K5, K9.... */ + /* Process first sample for 5th, 9th .. tap */ + /* f5(n) = f4(n) + K5 * g4(n-1) */ + fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fnext1; + fcurnt1 = __SSAT(fcurnt1, 16); + + /* Process second sample for 5th, 9th .. tap */ + fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fnext2; + fcurnt2 = __SSAT(fcurnt2, 16); + + /* Process third sample for 5th, 9th .. tap */ + fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fnext3; + fcurnt3 = __SSAT(fcurnt3, 16); + + /* Process fourth sample for 5th, 9th .. tap */ + fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fnext4; + fcurnt4 = __SSAT(fcurnt4, 16); + + /* Calculation of state values for next stage */ + /* g5(n) = f4(n) * K5 + g4(n-1) */ + gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15u) + gnext3; + gnext4 = __SSAT(gnext4, 16); + gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15u) + gnext2; + gnext3 = __SSAT(gnext3, 16); + gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15u) + gnext1; + gnext2 = __SSAT(gnext2, 16); + gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + stageCnt--; + } + + /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */ + stageCnt = (numStages - 1u) % 0x4u; + + while(stageCnt > 0u) + { + gcurnt1 = *px; + + /* save g value in state buffer */ + *px++ = (q15_t) gnext4; + + /* Process four samples for last three taps here */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + + fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + + fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15u) + gnext3; + gnext4 = __SSAT(gnext4, 16); + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + gnext2; + gnext3 = __SSAT(gnext3, 16); + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + gnext1; + gnext2 = __SSAT(gnext2, 16); + gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* Update of f values for next coefficient set processing */ + fcurnt1 = fnext1; + fcurnt2 = fnext2; + fcurnt3 = fnext3; + fcurnt4 = fnext4; + + stageCnt--; + + } + + /* The results in the 4 accumulators, store in the destination buffer. */ + /* y(n) = fN(n) */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = __PKHBT(fcurnt1, fcurnt2, 16); + *__SIMD32(pDst)++ = __PKHBT(fcurnt3, fcurnt4, 16); + +#else + + *__SIMD32(pDst)++ = __PKHBT(fcurnt2, fcurnt1, 16); + *__SIMD32(pDst)++ = __PKHBT(fcurnt4, fcurnt3, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurnt1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g2(n) from state buffer */ + gcurnt1 = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15u) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* save g1(n) in state buffer */ + *px++ = (q15_t) fcurnt1; + + /* f1(n) is saved in fcurnt1 + for next stage processing */ + fcurnt1 = fnext1; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + /* read g2(n) from state buffer */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15u) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15u) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + + /* f1(n) is saved in fcurnt1 + for next stage processing */ + fcurnt1 = fnext1; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = __SSAT(fcurnt1, 16); + + + blkCnt--; + + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t fcurnt, fnext, gcurnt, gnext; /* temporary variables */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurnt = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n-1) from state buffer */ + gcurnt = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext = ((gcurnt * (*pk)) >> 15u) + fcurnt; + fnext = __SSAT(fnext, 16); + + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext = ((fcurnt * (*pk++)) >> 15u) + gcurnt; + gnext = __SSAT(gnext, 16); + + /* save f0(n) in state buffer */ + *px++ = (q15_t) fcurnt; + + /* f1(n) is saved in fcurnt + for next stage processing */ + fcurnt = fnext; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + /* read g1(n-1) from state buffer */ + gcurnt = *px; + + /* save g0(n-1) in state buffer */ + *px++ = (q15_t) gnext; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext = ((gcurnt * (*pk)) >> 15u) + fcurnt; + fnext = __SSAT(fnext, 16); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext = ((fcurnt * (*pk++)) >> 15u) + gcurnt; + gnext = __SSAT(gnext, 16); + + + /* f1(n) is saved in fcurnt + for next stage processing */ + fcurnt = fnext; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = __SSAT(fcurnt, 16); + + + blkCnt--; + + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c new file mode 100644 index 000000000..9fb46459d --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c @@ -0,0 +1,348 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_lattice_q31.c +* +* Description: Q31 FIR lattice filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + +/** + * @brief Processing function for the Q31 FIR lattice filter. + * @param[in] *S points to an instance of the Q31 FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * In order to avoid overflows the input signal must be scaled down by 2*log2(numStages) bits. + */ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_lattice_q31( + const arm_fir_lattice_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *px; /* temporary state pointer */ + q31_t *pk; /* temporary coefficient pointer */ + q31_t fcurr1, fnext1, gcurr1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */ + q31_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + q31_t k; + + pState = &S->pState[0]; + + blkCnt = blockSize >> 1u; + + /* First part of the processing with loop unrolling. Compute 2 outputs at a time. + a second loop below computes the remaining 1 sample. */ + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* f0(n) = x(n) */ + fcurr2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n - 1) from state buffer */ + gcurr1 = *px; + + /* Read the reflection coefficient */ + k = *pk++; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + fnext1 = fcurr1 + (fnext1 << 1u); + gnext1 = gcurr1 + (gnext1 << 1u); + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext2 = (q31_t) (((q63_t) fcurr1 * k) >> 32); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32); + fnext2 = fcurr2 + (fnext2 << 1u); + gnext2 = fcurr1 + (gnext2 << 1u); + + /* save g1(n) in state buffer */ + *px++ = fcurr2; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + + /* Read the reflection coefficient */ + k = *pk++; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext2; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + fnext2 = (q31_t) (((q63_t) gnext1 * k) >> 32); + + fnext1 = fcurr1 + (fnext1 << 1u); + fnext2 = fcurr2 + (fnext2 << 1u); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32); + gnext2 = gnext1 + (gnext2 << 1u); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + gnext1 = gcurr1 + (gnext1 << 1u); + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + *pDst++ = fcurr2; + + blkCnt--; + + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x2u; + + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n - 1) from state buffer */ + gcurr1 = *px; + + /* Read the reflection coefficient */ + k = *pk++; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + fnext1 = fcurr1 + (fnext1 << 1u); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + gnext1 = gcurr1 + (gnext1 << 1u); + + /* save g1(n) in state buffer */ + *px++ = fcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + /* Read the reflection coefficient */ + k = *pk++; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + fnext1 = fcurr1 + (fnext1 << 1u); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + gnext1 = gcurr1 + (gnext1 << 1u); + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt--; + + } + + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + + blkCnt--; + + } + + +} + + +#else + +/* Run the below code for Cortex-M0 */ + +void arm_fir_lattice_q31( + const arm_fir_lattice_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *px; /* temporary state pointer */ + q31_t *pk; /* temporary coefficient pointer */ + q31_t fcurr, fnext, gcurr, gnext; /* temporary variables */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n-1) from state buffer */ + gcurr = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr; + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr; + /* save g1(n) in state buffer */ + *px++ = fcurr; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr = fnext; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while(stageCnt > 0u) + { + /* read g2(n) from state buffer */ + gcurr = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr; + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr = fnext; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr; + + blkCnt--; + + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + + +/** + * @} end of FIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c new file mode 100644 index 000000000..368014f43 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c @@ -0,0 +1,689 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_q15.c +* +* Description: Q15 FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR filter. + * @param[in] *S points to an instance of the Q15 FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * Refer to the function arm_fir_fast_q15() for a faster but less precise implementation of this function. + */ + +#ifndef ARM_MATH_CM0 + +/* Run the below code for Cortex-M4 and Cortex-M3 */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px1; /* Temporary q15 pointer for state buffer */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer of type q15 */ + px1 = pState; + + /* Initialize coeff pointer of type q31 */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = _SIMD32_OFFSET(px1); + + /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */ + x1 = _SIMD32_OFFSET(px1 + 1u); + + px1 += 2u; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLALD(x1, c0, acc1); + + /* Read state x[n-N-2], x[n-N-3] */ + x2 = _SIMD32_OFFSET(px1); + + /* Read state x[n-N-3], x[n-N-4] */ + x3 = _SIMD32_OFFSET(px1 + 1u); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLALD(x3, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLALD(x3, c0, acc1); + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px1 + 2u); + + /* Read state x[n-N-5], x[n-N-6] */ + x1 = _SIMD32_OFFSET(px1 + 3u); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLALD(x1, c0, acc3); + + px1 += 4u; + + tapCnt--; + + } + + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if((numTaps & 0x3u) != 0u) + { + /* Read 2 coefficients */ + c0 = *__SIMD32(pb)++; + + /* Fetch 4 state variables */ + x2 = _SIMD32_OFFSET(px1); + + x3 = _SIMD32_OFFSET(px1 + 1u); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + + px1 += 2u; + + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x2, c0, acc2); + acc3 = __SMLALD(x3, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + while(blkCnt > 0u) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer of type q15 */ + px1 = pState; + + /* Initialize coeff pointer of type q31 */ + pb = pCoeffs; + + tapCnt = numTaps >> 1; + + do + { + + c0 = *__SIMD32(pb)++; + x0 = *__SIMD32(px1)++; + + acc0 = __SMLALD(x0, c0, acc0); + tapCnt--; + } + while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + + /* Copy state values to start of state buffer */ + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy remaining data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else /* UNALIGNED_SUPPORT_DISABLE */ + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ + px = pState; + + /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = *__SIMD32(px)++; + + /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ + x2 = *__SIMD32(px)++; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* pack x[n-N-1] and x[n-N-2] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack x[n-N-3] and x[n-N-4] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* Read state x[n-N-6], x[n-N-7] with offset */ + x2 = _SIMD32_OFFSET(px + 2u); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack x[n-N-5] and x[n-N-6] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLALDX(x1, c0, acc3); + + /* Update state pointer for next state reading */ + px += 4u; + + /* Decrement tap count */ + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if((numTaps & 0x3u) != 0u) + { + + /* Read last two coefficients */ + c0 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc2 = __SMLALD(x2, c0, acc2); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read last state variables */ + x0 = *__SIMD32(px); + + /* Perform the multiply-accumulates */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* Perform the multiply-accumulates */ + acc3 = __SMLALDX(x1, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + while(blkCnt > 0u) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Use SIMD to hold states and coefficients */ + px = pState; + pb = pCoeffs; + + tapCnt = numTaps >> 1u; + + do + { + acc0 += (q31_t) * px++ * *pb++; + acc0 += (q31_t) * px++ * *pb++; + tapCnt--; + } + while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1u; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy remaining data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#else /* ARM_MATH_CM0 */ + + +/* Run the below code for Cortex-M0 */ + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + + + + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q31_t) * px++ * *pb++; + tapCnt--; + } while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) __SSAT((acc >> 15u), 16); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = (numTaps - 1u); + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + + + + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c new file mode 100644 index 000000000..b3b84ec51 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c @@ -0,0 +1,363 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_q31.c +* +* Description: Q31 FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q31 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * Refer to the function arm_fir_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_fir_q31( + const arm_fir_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t x0, x1, x2; /* Temporary variables to hold state */ + q31_t c0; /* Temporary variable to hold coefficient value */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc0, acc1, acc2; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt, tapCntN3; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize / 3; + blockSize = blockSize - (3 * blkCnt); + + tapCnt = numTaps / 3; + tapCntN3 = numTaps - (3 * tapCnt); + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy three new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1] */ + x0 = *(px++); + x1 = *(px++); + + /* Loop unrolling. Process 3 taps at a time. */ + i = tapCnt; + + while(i > 0u) + { + /* Read the b[numTaps] coefficient */ + c0 = *pb; + + /* Read x[n-numTaps-2] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x0 * c0); + acc1 += ((q63_t) x1 * c0); + acc2 += ((q63_t) x2 * c0); + + /* Read the coefficient and state */ + c0 = *(pb + 1u); + x0 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x1 * c0); + acc1 += ((q63_t) x2 * c0); + acc2 += ((q63_t) x0 * c0); + + /* Read the coefficient and state */ + c0 = *(pb + 2u); + x1 = *(px++); + + /* update coefficient pointer */ + pb += 3u; + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x2 * c0); + acc1 += ((q63_t) x0 * c0); + acc2 += ((q63_t) x1 * c0); + + /* Decrement the loop counter */ + i--; + } + + /* If the filter length is not a multiple of 3, compute the remaining filter taps */ + + i = tapCntN3; + + while(i > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x0 * c0); + acc1 += ((q63_t) x1 * c0); + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 3 to process the next group of 3 samples */ + pState = pState + 3; + + /* The results in the 3 accumulators are in 2.30 format. Convert to 1.31 + ** Then store the 3 outputs in the destination buffer. */ + *pDst++ = (q31_t) (acc0 >> 31u); + *pDst++ = (q31_t) (acc1 >> 31u); + *pDst++ = (q31_t) (acc2 >> 31u); + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + + while(blockSize > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += (q63_t) * (px++) * (*(pb++)); + i--; + } while(i > 0u); + + /* The result is in 2.62 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc0 >> 31u); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blockSize--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Length of the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q63_t) * px++ * *pb++; + i--; + } while(i > 0u); + + /* The result is in 2.62 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc >> 31u); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the starting of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = numTaps - 1u; + + /* Copy the data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c new file mode 100644 index 000000000..624afa130 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c @@ -0,0 +1,388 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_q7.c +* +* Description: Q7 FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated. +* +* Version 0.0.5 2010/04/26 +* incorporated review comments and updated with latest CMSIS layer +* +* Version 0.0.3 2010/03/10 +* Initial version +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q7 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is converted to 18.7 format by discarding the low 7 bits. + * Finally, the result is truncated to 1.7 format. + */ + +void arm_fir_q7( + const arm_fir_instance_q7 * S, + q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *pStateCurnt; /* Points to the current sample of the state */ + q7_t x0, x1, x2, x3; /* Temporary variables to hold state */ + q7_t c0; /* Temporary variable to hold coefficient value */ + q7_t *px; /* Temporary pointer for state */ + q7_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first three samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + i = tapCnt; + + while(i > 0u) + { + /* Read the b[numTaps] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x3 = *(px++); + + /* acc0 += b[numTaps] * x[n-numTaps] */ + acc0 += ((q15_t) x0 * c0); + + /* acc1 += b[numTaps] * x[n-numTaps-1] */ + acc1 += ((q15_t) x1 * c0); + + /* acc2 += b[numTaps] * x[n-numTaps-2] */ + acc2 += ((q15_t) x2 * c0); + + /* acc3 += b[numTaps] * x[n-numTaps-3] */ + acc3 += ((q15_t) x3 * c0); + + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x1 * c0); + acc1 += ((q15_t) x2 * c0); + acc2 += ((q15_t) x3 * c0); + acc3 += ((q15_t) x0 * c0); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x2 * c0); + acc1 += ((q15_t) x3 * c0); + acc2 += ((q15_t) x0 * c0); + acc3 += ((q15_t) x1 * c0); + /* Read the b[numTaps-3] coefficients */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x3 * c0); + acc1 += ((q15_t) x0 * c0); + acc2 += ((q15_t) x1 * c0); + acc3 += ((q15_t) x2 * c0); + i--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + i = numTaps - (tapCnt * 4u); + while(i > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x0 * c0); + acc1 += ((q15_t) x1 * c0); + acc2 += ((q15_t) x2 * c0); + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31 + ** Then store the 4 outputs in the destination buffer. */ + acc0 = __SSAT((acc0 >> 7u), 8); + *pDst++ = acc0; + acc1 = __SSAT((acc1 >> 7u), 8); + *pDst++ = acc1; + acc2 = __SSAT((acc2 >> 7u), 8); + *pDst++ = acc2; + acc3 = __SSAT((acc3 >> 7u), 8); + *pDst++ = acc3; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 4u; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += (q15_t) * (px++) * (*(pb++)); + i--; + } while(i > 0u); + + /* The result is in 2.14 format. Convert to 1.7 + ** Then store the output in the destination buffer. */ + *pDst++ = __SSAT((acc0 >> 7u), 8); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t i, blkCnt; /* Loop counters */ + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *px, *pb; /* Temporary pointers to state and coeff */ + q31_t acc = 0; /* Accumlator */ + q7_t *pStateCurnt; /* Points to the current sample of the state */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + /* Perform filtering upto BlockSize - BlockSize%4 */ + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set accumulator to zero */ + acc = 0; + + /* Initialize state pointer of type q7 */ + px = pState; + + /* Initialize coeff pointer of type q7 */ + pb = pCoeffs; + + + i = numTaps; + + while(i > 0u) + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q15_t) * px++ * *pb++; + i--; + } + + /* Store the 1.7 format filter output in destination buffer */ + *pDst++ = (q7_t) __SSAT((acc >> 7), 8); + + /* Advance the state pointer by 1 to process the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + + /* Copy numTaps number of values */ + i = (numTaps - 1u); + + /* Copy q7_t data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + i--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c new file mode 100644 index 000000000..e969d5c84 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c @@ -0,0 +1,365 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_f32.c +* +* Description: Floating-point sparse FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ------------------------------------------------------------------- */ +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters + * + * This group of functions implements sparse FIR filters. + * Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero. + * Sparse filters are used for simulating reflections in communications and audio applications. + * + * There are separate functions for Q7, Q15, Q31, and floating-point data types. + * The functions operate on blocks of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst points to input and output arrays respectively containing blockSize values. + * + * \par Algorithm: + * The sparse filter instant structure contains an array of tap indices pTapDelay which specifies the locations of the non-zero coefficients. + * This is in addition to the coefficient array b. + * The implementation essentially skips the multiplications by zero and leads to an efficient realization. + *
   
+ *     y[n] = b[0] * x[n-pTapDelay[0]] + b[1] * x[n-pTapDelay[1]] + b[2] * x[n-pTapDelay[2]] + ...+ b[numTaps-1] * x[n-pTapDelay[numTaps-1]]    
+ * 
+ * \par + * \image html FIRSparse.gif "Sparse FIR filter. b[n] represents the filter coefficients" + * \par + * pCoeffs points to a coefficient array of size numTaps; + * pTapDelay points to an array of nonzero indices and is also of size numTaps; + * pState points to a state array of size maxDelay + blockSize, where + * maxDelay is the largest offset value that is ever used in the pTapDelay array. + * Some of the processing functions also require temporary working buffers. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 4 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 4 different data type filter instance structures + *
    
+ *arm_fir_sparse_instance_f32 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};    
+ *arm_fir_sparse_instance_q31 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};    
+ *arm_fir_sparse_instance_q15 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};    
+ *arm_fir_sparse_instance_q7 S =  {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};    
+ * 
+ * \par + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the sparse FIR filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Processing function for the floating-point sparse FIR filter. + * @param[in] *S points to an instance of the floating-point sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void arm_fir_sparse_f32( + arm_fir_sparse_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + float32_t * pScratchIn, + uint32_t blockSize) +{ + + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *px; /* Scratch buffer pointer */ + float32_t *py = pState; /* Temporary pointers for state buffer */ + float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + float32_t *pOut; /* Destination pointer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + float32_t coeff = *pCoeffs++; /* Read the first coefficient value */ + + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, + (int32_t *) pSrc, 1, blockSize); + + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 Multiplications at a time. */ + blkCnt = blockSize >> 2u; + + while(blkCnt > 0u) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2u; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - + (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = + ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c new file mode 100644 index 000000000..2934352d0 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c @@ -0,0 +1,102 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_init_f32.c +* +* Description: Floating-point sparse FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the floating-point sparse FIR filter. + * @param[in,out] *S points to an instance of the floating-point sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of samples processed by the arm_fir_sparse_f32() function. + */ + +void arm_fir_sparse_init_f32( + arm_fir_sparse_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0u; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c new file mode 100644 index 000000000..47a116e95 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c @@ -0,0 +1,102 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_init_q15.c +* +* Description: Q15 sparse FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the Q15 sparse FIR filter. + * @param[in,out] *S points to an instance of the Q15 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of words processed by arm_fir_sparse_q15() function. + */ + +void arm_fir_sparse_init_q15( + arm_fir_sparse_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0u; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c new file mode 100644 index 000000000..360a219b2 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c @@ -0,0 +1,101 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_init_q31.c +* +* Description: Q31 sparse FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the Q31 sparse FIR filter. + * @param[in,out] *S points to an instance of the Q31 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the number of words processed by arm_fir_sparse_q31() function. + */ + +void arm_fir_sparse_init_q31( + arm_fir_sparse_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0u; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c new file mode 100644 index 000000000..b400297ca --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c @@ -0,0 +1,102 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_init_q7.c +* +* Description: Q7 sparse FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the Q7 sparse FIR filter. + * @param[in,out] *S points to an instance of the Q7 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of samples processed by the arm_fir_sparse_q7() function. + */ + +void arm_fir_sparse_init_q7( + arm_fir_sparse_instance_q7 * S, + uint16_t numTaps, + q7_t * pCoeffs, + q7_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0u; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(q7_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c new file mode 100644 index 000000000..016f83345 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c @@ -0,0 +1,406 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_q15.c +* +* Description: Q15 sparse FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ------------------------------------------------------------------- */ +#include "arm_math.h" + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Processing function for the Q15 sparse FIR filter. + * @param[in] *S points to an instance of the Q15 sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] *pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 32-bit accumulator. + * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. + * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. + * If the accumulator result overflows it will wrap around rather than saturate. + * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. + * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. + */ + + +void arm_fir_sparse_q15( + arm_fir_sparse_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + q15_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize) +{ + + q15_t *pState = S->pState; /* State pointer */ + q15_t *pIn = pSrc; /* Working pointer for input */ + q15_t *pOut = pDst; /* Working pointer for output */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *px; /* Temporary pointers for scratch buffer */ + q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q15_t *py = pState; /* Temporary pointers for state buffer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Filter order */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ + q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t in1, in2; /* Temporary variables */ + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 multiplications at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + in1 = *pScr2++; + in2 = *pScr2++; + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), + 16); + +#else + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), + 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + in1 = *pScr2++; + + in2 = *pScr2++; + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), + 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), + 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + blkCnt--; + + } + + /* If the blockSize is not a multiple of 4, + remaining samples are processed in the below loop */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); + blkCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); + blkCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c new file mode 100644 index 000000000..3d2f6d402 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c @@ -0,0 +1,370 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_q31.c +* +* Description: Q31 sparse FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ------------------------------------------------------------------- */ +#include "arm_math.h" + + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Processing function for the Q31 sparse FIR filter. + * @param[in] *S points to an instance of the Q31 sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 32-bit accumulator. + * The 1.31 x 1.31 multiplications are truncated to 2.30 format. + * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit. + * If the accumulator result overflows, it wraps around rather than saturate. + * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. + */ + +void arm_fir_sparse_q31( + arm_fir_sparse_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + q31_t * pScratchIn, + uint32_t blockSize) +{ + + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *px; /* Scratch buffer pointer */ + q31_t *py = pState; /* Temporary pointers for state buffer */ + q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q31_t *pOut; /* Destination pointer */ + q63_t out; /* Temporary output variable */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Filter order */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q31_t coeff = *pCoeffs++; /* Read the first coefficient value */ + q31_t in; + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, + (int32_t *) pSrc, 1, blockSize); + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 Multiplications at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + /* Perform Multiplications and store in the destination buffer */ + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform Multiplications and store in the destination buffer */ + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Working output pointer is updated */ + pOut = pDst; + + /* Output is converted into 1.31 format. */ + /* Loop over the blockSize. Unroll by a factor of 4. + * process 4 output samples at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + in = *pOut << 1; + *pOut++ = in; + in = *pOut << 1; + *pOut++ = in; + in = *pOut << 1; + *pOut++ = in; + in = *pOut << 1; + *pOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * process the remaining output samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + in = *pOut << 1; + *pOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform Multiplications and store in the destination buffer */ + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Working output pointer is updated */ + pOut = pDst; + + /* Output is converted into 1.31 format. */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + in = *pOut << 1; + *pOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c new file mode 100644 index 000000000..ace5d0703 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c @@ -0,0 +1,398 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_fir_sparse_q7.c +* +* Description: Q7 sparse FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ------------------------------------------------------------------- */ +#include "arm_math.h" + + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + + +/** + * @brief Processing function for the Q7 sparse FIR filter. + * @param[in] *S points to an instance of the Q7 sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] *pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is then converted to 18.7 format by discarding the low 7 bits. + * Finally, the result is truncated to 1.7 format. + */ + +void arm_fir_sparse_q7( + arm_fir_sparse_instance_q7 * S, + q7_t * pSrc, + q7_t * pDst, + q7_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize) +{ + + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *px; /* Scratch buffer pointer */ + q7_t *py = pState; /* Temporary pointers for state buffer */ + q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q7_t *pOut = pDst; /* Destination pointer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Filter order */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q7_t coeff = *pCoeffs++; /* Read the coefficient value */ + q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */ + q31_t in; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t in1, in2, in3, in4; + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, + blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 multiplications at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - + (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize >> 2; + + while(blkCnt > 0u) + { + in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + + *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + remaining samples are processed in the below loop */ + blkCnt = blockSize % 0x4u; + + while(blkCnt > 0u) + { + *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, + blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 1u; + + while(tapCnt > 0u) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = + ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if(readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c new file mode 100644 index 000000000..074605553 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c @@ -0,0 +1,440 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_iir_lattice_f32.c +* +* Description: Floating-point IIR Lattice filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters + * + * This set of functions implements lattice filters + * for Q15, Q31 and floating-point data types. Lattice filters are used in a + * variety of adaptive filter applications. The filter structure has feedforward and + * feedback components and the net impulse response is infinite length. + * The functions operate on blocks + * of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst point to input and output arrays containing blockSize values. + + * \par Algorithm: + * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter" + *
    
+ *    fN(n)   =  x(n)    
+ *    fm-1(n) = fm(n) - km * gm-1(n-1)   for m = N, N-1, ...1    
+ *    gm(n)   = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1    
+ *    y(n)    = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)    
+ * 
+ * \par + * pkCoeffs points to array of reflection coefficients of size numStages. + * Reflection coefficients are stored in time-reversed order. + * \par + *
    
+ *    {kN, kN-1, ....k1}    
+ * 
+ * pvCoeffs points to the array of ladder coefficients of size (numStages+1). + * Ladder coefficients are stored in time-reversed order. + * \par + *
    
+ *    {vN, vN-1, ...v0}    
+ * 
+ * pState points to a state array of size numStages + blockSize. + * The state variables shown in the figure above (the g values) are stored in the pState array. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows: + *
    
+ *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};    
+ *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};    
+ *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};    
+ * 
+ * \par + * where numStages is the number of stages in the filter; pState points to the state buffer array; + * pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients. + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the IIR lattice filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the floating-point IIR lattice filter. + * @param[in] *S points to an instance of the floating-point IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_iir_lattice_f32( + const arm_iir_lattice_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t fnext1, gcurr1, gnext; /* Temporary variables for lattice stages */ + float32_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* temporary variables for counts */ + float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + float32_t *pState; /* State pointer */ + float32_t *pStateCurnt; /* State current pointer */ + float32_t k1, k2; + float32_t v1, v2, v3, v4; + float32_t gcurr2; + float32_t fnext2; + + /* initialise loop count */ + blkCnt = blockSize; + + /* initialise state pointer */ + pState = &S->pState[0]; + + /* Sample processing */ + while(blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fnext2 = *pSrc++; + + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + + /* Set accumulator to zero */ + acc = 0.0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages) >> 2; + + while(tapCnt > 0u) + { + /* Read gN-1(n-1) from state buffer */ + gcurr1 = *px1; + + /* read reflection coefficient kN */ + k1 = *pk; + + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext1 = fnext2 - (k1 * gcurr1); + + /* read ladder coefficient vN */ + v1 = *pv; + + /* read next reflection coefficient kN-1 */ + k2 = *(pk + 1u); + + /* Read gN-2(n-1) from state buffer */ + gcurr2 = *(px1 + 1u); + + /* read next ladder coefficient vN-1 */ + v2 = *(pv + 1u); + + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext2 = fnext1 - (k2 * gcurr2); + + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = gcurr1 + (k1 * fnext1); + + /* read reflection coefficient kN-2 */ + k1 = *(pk + 2u); + + /* write gN(n) into state for next sample processing */ + *px2++ = gnext; + + /* Read gN-3(n-1) from state buffer */ + gcurr1 = *(px1 + 2u); + + /* y(n) += gN(n) * vN */ + acc += (gnext * v1); + + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fnext1 = fnext2 - (k1 * gcurr1); + + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = gcurr2 + (k2 * fnext2); + + /* Read gN-4(n-1) from state buffer */ + gcurr2 = *(px1 + 3u); + + /* y(n) += gN-1(n) * vN-1 */ + acc += (gnext * v2); + + /* read reflection coefficient kN-3 */ + k2 = *(pk + 3u); + + /* write gN-1(n) into state for next sample processing */ + *px2++ = gnext; + + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext2 = fnext1 - (k2 * gcurr2); + + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = gcurr1 + (k1 * fnext1); + + /* read ladder coefficient vN-2 */ + v3 = *(pv + 2u); + + /* y(n) += gN-2(n) * vN-2 */ + acc += (gnext * v3); + + /* write gN-2(n) into state for next sample processing */ + *px2++ = gnext; + + /* update pointer */ + pk += 4u; + + /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */ + gnext = (fnext2 * k2) + gcurr2; + + /* read next ladder coefficient vN-3 */ + v4 = *(pv + 3u); + + /* y(n) += gN-4(n) * vN-4 */ + acc += (gnext * v4); + + /* write gN-3(n) into state for next sample processing */ + *px2++ = gnext; + + /* update pointers */ + px1 += 4u; + pv += 4u; + + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages) % 0x4u; + + while(tapCnt > 0u) + { + gcurr1 = *px1++; + /* Process sample for last taps */ + fnext1 = fnext2 - ((*pk) * gcurr1); + gnext = (fnext1 * (*pk++)) + gcurr1; + /* Output samples for last taps */ + acc += (gnext * (*pv++)); + *px2++ = gnext; + fnext2 = fnext1; + + tapCnt--; + + } + + /* y(n) += g0(n) * v0 */ + acc += (fnext2 * (*pv)); + + *px2++ = fnext2; + + /* write out into pDst */ + *pDst++ = acc; + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1u; + + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + + } + + /* Calculate remaining number of copies */ + tapCnt = (numStages) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else + +void arm_iir_lattice_f32( + const arm_iir_lattice_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t fcurr, fnext = 0, gcurr, gnext; /* Temporary variables for lattice stages */ + float32_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* temporary variables for counts */ + float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + float32_t *pState; /* State pointer */ + float32_t *pStateCurnt; /* State current pointer */ + + + /* Run the below code for Cortex-M0 */ + + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while(blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0.0f; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for numStages */ + tapCnt = numStages; + + while(tapCnt > 0u) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = fcurr - ((*pk) * gcurr); + gnext = (fnext * (*pk++)) + gcurr; + + /* Output samples for last taps */ + acc += (gnext * (*pv++)); + *px2++ = gnext; + fcurr = fnext; + + /* Decrementing loop counter */ + tapCnt--; + + } + + /* y(n) += g0(n) * v0 */ + acc += (fnext * (*pv)); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = acc; + + /* Advance the state pointer by 1 to process the next group of samples */ + pState = pState + 1u; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages; + + /* Copy the data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0 */ + + +/** + * @} end of IIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c new file mode 100644 index 000000000..89c00c24d --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c @@ -0,0 +1,86 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_iir_lattice_init_f32.c +* +* Description: Floating-point IIR lattice filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Initialization function for the floating-point IIR lattice filter. + * @param[in] *S points to an instance of the floating-point IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_iir_lattice_init_f32( + arm_iir_lattice_instance_f32 * S, + uint16_t numStages, + float32_t * pkCoeffs, + float32_t * pvCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset(pState, 0, (numStages + blockSize) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + +} + + /** + * @} end of IIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c new file mode 100644 index 000000000..7dac99fca --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c @@ -0,0 +1,86 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_iir_lattice_init_q15.c +* +* Description: Q15 IIR lattice filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q15 IIR lattice filter. + * @param[in] *S points to an instance of the Q15 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process per call. + * @return none. + */ + +void arm_iir_lattice_init_q15( + arm_iir_lattice_instance_q15 * S, + uint16_t numStages, + q15_t * pkCoeffs, + q15_t * pvCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset(pState, 0, (numStages + blockSize) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + +} + +/** + * @} end of IIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c new file mode 100644 index 000000000..73b18a597 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c @@ -0,0 +1,86 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_iir_lattice_init_q31.c +* +* Description: Initialization function for the Q31 IIR lattice filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q31 IIR lattice filter. + * @param[in] *S points to an instance of the Q31 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_iir_lattice_init_q31( + arm_iir_lattice_instance_q31 * S, + uint16_t numStages, + q31_t * pkCoeffs, + q31_t * pvCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset(pState, 0, (numStages + blockSize) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + +} + +/** + * @} end of IIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c new file mode 100644 index 000000000..5f0f891d1 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c @@ -0,0 +1,457 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_iir_lattice_q15.c +* +* Description: Q15 IIR lattice filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the Q15 IIR lattice filter. + * @param[in] *S points to an instance of the Q15 IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + */ + +void arm_iir_lattice_q15( + const arm_iir_lattice_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t fcurr, fnext, gcurr = 0, gnext; /* Temporary variables for lattice stages */ + q15_t gnext1, gnext2; /* Temporary variables for lattice stages */ + uint32_t stgCnt; /* Temporary variables for counts */ + q63_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* Temporary variables for counts */ + q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + q15_t *pState; /* State pointer */ + q15_t *pStateCurnt; /* State current pointer */ + q15_t out; /* Temporary variable for output */ + q15_t v1, v2; + q31_t v; /* Temporary variable for ladder coefficient */ + + + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while(blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for first tap */ + gcurr = *px1++; + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext = __SSAT(gnext, 16); + /* write gN(n) into state for next sample processing */ + *px2++ = (q15_t) gnext; + /* y(n) += gN(n) * vN */ + acc += (q31_t) ((gnext * (*pv++))); + + + /* Update f values for next coefficient processing */ + fcurr = fnext; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages - 1u) >> 2; + + while(tapCnt > 0u) + { + + /* Process sample for 2nd, 6th ...taps */ + /* Read gN-2(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 2nd, 6th .. taps */ + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext1 = (q15_t) __SSAT(gnext, 16); + /* write gN-1(n) into state */ + *px2++ = (q15_t) gnext1; + + + /* Process sample for 3nd, 7th ...taps */ + /* Read gN-3(n-1) from state */ + gcurr = *px1++; + /* Process sample for 3rd, 7th .. taps */ + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15); + fcurr = __SSAT(fcurr, 16); + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr; + gnext2 = (q15_t) __SSAT(gnext, 16); + /* write gN-2(n) into state */ + *px2++ = (q15_t) gnext2; + + /* Read vN-1 and vN-2 at a time */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + v = *__SIMD32(pv)++; + +#else + + v1 = *pv++; + v2 = *pv++; + +#ifndef ARM_MATH_BIG_ENDIAN + + v = __PKHBT(v1, v2, 16); + +#else + + v = __PKHBT(v2, v1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + + /* Pack gN-1(n) and gN-2(n) */ + +#ifndef ARM_MATH_BIG_ENDIAN + + gnext = __PKHBT(gnext1, gnext2, 16); + +#else + + gnext = __PKHBT(gnext2, gnext1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* y(n) += gN-1(n) * vN-1 */ + /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */ + /* y(n) += gN-2(n) * vN-2 */ + /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */ + acc = __SMLALD(gnext, v, acc); + + + /* Process sample for 4th, 8th ...taps */ + /* Read gN-4(n-1) from state */ + gcurr = *px1++; + /* Process sample for 4th, 8th .. taps */ + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN-3(n) = kN-3 * fN-1(n) + gN-1(n-1) */ + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext1 = (q15_t) __SSAT(gnext, 16); + /* write gN-3(n) for the next sample process */ + *px2++ = (q15_t) gnext1; + + + /* Process sample for 5th, 9th ...taps */ + /* Read gN-5(n-1) from state */ + gcurr = *px1++; + /* Process sample for 5th, 9th .. taps */ + /* fN-5(n) = fN-4(n) - kN-4 * gN-5(n-1) */ + fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15); + fcurr = __SSAT(fcurr, 16); + /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */ + gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr; + gnext2 = (q15_t) __SSAT(gnext, 16); + /* write gN-4(n) for the next sample process */ + *px2++ = (q15_t) gnext2; + + /* Read vN-3 and vN-4 at a time */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + v = *__SIMD32(pv)++; + +#else + + v1 = *pv++; + v2 = *pv++; + +#ifndef ARM_MATH_BIG_ENDIAN + + v = __PKHBT(v1, v2, 16); + +#else + + v = __PKHBT(v2, v1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + + /* Pack gN-3(n) and gN-4(n) */ +#ifndef ARM_MATH_BIG_ENDIAN + + gnext = __PKHBT(gnext1, gnext2, 16); + +#else + + gnext = __PKHBT(gnext2, gnext1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* y(n) += gN-4(n) * vN-4 */ + /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */ + /* y(n) += gN-3(n) * vN-3 */ + /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */ + acc = __SMLALD(gnext, v, acc); + + tapCnt--; + + } + + fnext = fcurr; + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages - 1u) % 0x4u; + + while(tapCnt > 0u) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext = __SSAT(gnext, 16); + /* Output samples for last taps */ + acc += (q31_t) (((q31_t) gnext * (*pv++))); + *px2++ = (q15_t) gnext; + fcurr = fnext; + + tapCnt--; + } + + /* y(n) += g0(n) * v0 */ + acc += (q31_t) (((q31_t) fnext * (*pv++))); + + out = (q15_t) __SSAT(acc >> 15, 16); + *px2++ = (q15_t) fnext; + + /* write out into pDst */ + *pDst++ = out; + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1u; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + stgCnt = (numStages >> 2u); + + /* copy data */ + while(stgCnt > 0u) + { +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + +#else + + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + stgCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + stgCnt = (numStages) % 0x4u; + + /* copy data */ + while(stgCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + stgCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */ + uint32_t stgCnt; /* Temporary variables for counts */ + q63_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* Temporary variables for counts */ + q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + q15_t *pState; /* State pointer */ + q15_t *pStateCurnt; /* State current pointer */ + q15_t out; /* Temporary variable for output */ + + + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while(blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + tapCnt = numStages; + + while(tapCnt > 0u) + { + gcurr = *px1++; + /* Process sample */ + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = fcurr - ((gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = ((fnext * (*pk++)) >> 15) + gcurr; + gnext = __SSAT(gnext, 16); + /* Output samples */ + /* y(n) += gN(n) * vN */ + acc += (q31_t) ((gnext * (*pv++))); + /* write gN(n) into state for next sample processing */ + *px2++ = (q15_t) gnext; + /* Update f values for next coefficient processing */ + fcurr = fnext; + + tapCnt--; + } + + /* y(n) += g0(n) * v0 */ + acc += (q31_t) ((fnext * (*pv++))); + + out = (q15_t) __SSAT(acc >> 15, 16); + *px2++ = (q15_t) fnext; + + /* write out into pDst */ + *pDst++ = out; + + /* Advance the state pointer by 1 to process the next group of samples */ + pState = pState + 1u; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + stgCnt = numStages; + + /* copy data */ + while(stgCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + stgCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + + + + +/** + * @} end of IIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c new file mode 100644 index 000000000..adfd4dfb5 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c @@ -0,0 +1,345 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_iir_lattice_q31.c +* +* Description: Q31 IIR lattice filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the Q31 IIR lattice filter. + * @param[in] *S points to an instance of the Q31 IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 2*log2(numStages) bits. + * After all multiply-accumulates are performed, the 2.62 accumulator is saturated to 1.32 format and then truncated to 1.31 format. + */ + +void arm_iir_lattice_q31( + const arm_iir_lattice_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */ + q63_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* Temporary variables for counts */ + q31_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + q31_t *pState; /* State pointer */ + q31_t *pStateCurnt; /* State current pointer */ + + blkCnt = blockSize; + + pState = &S->pState[0]; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Sample processing */ + while(blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for first tap */ + gcurr = *px1++; + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* write gN-1(n-1) into state for next sample processing */ + *px2++ = gnext; + /* y(n) += gN(n) * vN */ + acc += ((q63_t) gnext * *pv++); + + /* Update f values for next coefficient processing */ + fcurr = fnext; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages - 1u) >> 2; + + while(tapCnt > 0u) + { + + /* Process sample for 2nd, 6th .. taps */ + /* Read gN-2(n-1) from state buffer */ + gcurr = *px1++; + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* y(n) += gN-1(n) * vN-1 */ + /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-1(n) into state for next sample processing */ + *px2++ = gnext; + + /* Process sample for 3nd, 7th ...taps */ + /* Read gN-3(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 3rd, 7th .. taps */ + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31)); + /* y(n) += gN-2(n) * vN-2 */ + /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-2(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 4th, 8th ...taps */ + /* Read gN-4(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 4th, 8th .. taps */ + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* y(n) += gN-3(n) * vN-3 */ + /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-3(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 5th, 9th ...taps */ + /* Read gN-5(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 5th, 9th .. taps */ + /* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */ + fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31)); + /* y(n) += gN-4(n) * vN-4 */ + /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-4(n) into state for next sample processing */ + *px2++ = gnext; + + tapCnt--; + + } + + fnext = fcurr; + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages - 1u) % 0x4u; + + while(tapCnt > 0u) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* Output samples for last taps */ + acc += ((q63_t) gnext * *pv++); + *px2++ = gnext; + fcurr = fnext; + + tapCnt--; + + } + + /* y(n) += g0(n) * v0 */ + acc += (q63_t) fnext *( + *pv++); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = (q31_t) (acc >> 31u); + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1u; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + + } + + /* Calculate remaining number of copies */ + tapCnt = (numStages) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + }; + +#else + + /* Run the below code for Cortex-M0 */ + /* Sample processing */ + while(blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + tapCnt = numStages; + + while(tapCnt > 0u) + { + gcurr = *px1++; + /* Process sample */ + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = + clip_q63_to_q31(((q63_t) fcurr - + ((q31_t) (((q63_t) gcurr * (*pk)) >> 31)))); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = + clip_q63_to_q31(((q63_t) gcurr + + ((q31_t) (((q63_t) fnext * (*pk++)) >> 31)))); + /* Output samples */ + /* y(n) += gN(n) * vN */ + acc += ((q63_t) gnext * *pv++); + /* write gN-1(n-1) into state for next sample processing */ + *px2++ = gnext; + /* Update f values for next coefficient processing */ + fcurr = fnext; + + tapCnt--; + } + + /* y(n) += g0(n) * v0 */ + acc += (q63_t) fnext *( + *pv++); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = (q31_t) (acc >> 31u); + + /* Advance the state pointer by 1 to process the next group of samples */ + pState = pState + 1u; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + + + + +/** + * @} end of IIR_Lattice group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c new file mode 100644 index 000000000..90fa8ae1e --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c @@ -0,0 +1,434 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_f32.c +* +* Description: Processing function for the floating-point LMS filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup LMS Least Mean Square (LMS) Filters + * + * LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions. + * LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal. + * Adaptive filters are often used in communication systems, equalizers, and noise removal. + * The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types. + * The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal. + * + * An LMS filter consists of two components as shown below. + * The first component is a standard transversal or FIR filter. + * The second component is a coefficient update mechanism. + * The LMS filter has two input signals. + * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter. + * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input. + * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input. + * This "error signal" tends towards zero as the filter adapts. + * The LMS processing functions accept the input and reference input signals and generate the filter output and error signal. + * \image html LMS.gif "Internal structure of the Least Mean Square filter" + * + * The functions operate on blocks of data and each call to the function processes + * blockSize samples through the filter. + * pSrc points to input signal, pRef points to reference signal, + * pOut points to output signal and pErr points to error signal. + * All arrays contain blockSize values. + * + * The functions operate on a block-by-block basis. + * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis. + * The convergence of the LMS filter is slower compared to the normalized LMS algorithm. + * + * \par Algorithm: + * The output signal y[n] is computed by a standard FIR filter: + *
    
+ *     y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]    
+ * 
+ * + * \par + * The error signal equals the difference between the reference signal d[n] and the filter output: + *
    
+ *     e[n] = d[n] - y[n].    
+ * 
+ * + * \par + * After each sample of the error signal is computed, the filter coefficients b[k] are updated on a sample-by-sample basis: + *
    
+ *     b[k] = b[k] + e[n] * mu * x[n-k],  for k=0, 1, ..., numTaps-1    
+ * 
+ * where mu is the step size and controls the rate of coefficient convergence. + *\par + * In the APIs, pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the order: + * \par + *
    
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}    
+ * 
+ * \par + * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples. + * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters, + * to be avoided and yields a significant speed improvement. + * The state variables are updated after each block of data is processed. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter and + * coefficient and state arrays cannot be shared among instances. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 3 different data type filter instance structures + *
    
+ *    arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};    
+ *    arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};    
+ *    arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};    
+ * 
+ * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer; mu is the step size parameter; and postShift is the shift applied to coefficients. + * + * \par Fixed-Point Behavior: + * Care must be taken when using the Q15 and Q31 versions of the LMS filter. + * The following issues must be considered: + * - Scaling of coefficients + * - Overflow and saturation + * + * \par Scaling of Coefficients: + * Filter coefficients are represented as fractional values and + * coefficients are restricted to lie in the range [-1 +1). + * The fixed-point functions have an additional scaling parameter postShift. + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * This essentially scales the filter coefficients by 2^postShift and + * allows the filter coefficients to exceed the range [+1 -1). + * The value of postShift is set by the user based on the expected gain through the system being modeled. + * + * \par Overflow and Saturation: + * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are + * described separately as part of the function specific documentation below. + */ + +/** + * @addtogroup LMS + * @{ + */ + +/** + * @details + * This function operates on floating-point data types. + * + * @brief Processing function for floating-point LMS filter. + * @param[in] *S points to an instance of the floating-point LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_lms_f32( + const arm_lms_instance_f32 * S, + float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + float32_t sum, e, d; /* accumulator, error, reference data sample */ + float32_t w = 0.0f; /* weight factor */ + + e = 0.0f; + d = 0.0f; + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + blkCnt = blockSize; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result in the accumulator, store in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Calculation of Weighting factor for the updating filter coefficients */ + w = e * mu; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb = *pb + (w * (*px++)); + pb++; + + *pb = *pb + (w * (*px++)); + pb++; + + *pb = *pb + (w * (*px++)); + pb++; + + *pb = *pb + (w * (*px++)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb = *pb + (w * (*px++)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1u) samples copy */ + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is stored in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Weighting factor for the LMS version */ + w = e * mu; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb = *pb + (w * (*px++)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + * start of the state buffer. This prepares the state buffer for the + * next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1u) samples */ + tapCnt = (numTaps - 1u); + + /* Copy the data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of LMS group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c new file mode 100644 index 000000000..a2f51240a --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c @@ -0,0 +1,90 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_init_f32.c +* +* Description: Floating-point LMS filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Initialization function for floating-point LMS filter. + * @param[in] *S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to the coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +/** + * \par Description: + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_lms_f32(). + */ + +void arm_lms_init_f32( + arm_lms_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps */ + memset(pState, 0, (numTaps + (blockSize - 1)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; +} + +/** + * @} end of LMS group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q15.c new file mode 100644 index 000000000..8f42949a6 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q15.c @@ -0,0 +1,100 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_init_q15.c +* +* Description: Q15 LMS filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + +/** +* @brief Initialization function for the Q15 LMS filter. +* @param[in] *S points to an instance of the Q15 LMS filter structure. +* @param[in] numTaps number of filter coefficients. +* @param[in] *pCoeffs points to the coefficient buffer. +* @param[in] *pState points to the state buffer. +* @param[in] mu step size that controls filter coefficient updates. +* @param[in] blockSize number of samples to process. +* @param[in] postShift bit shift applied to coefficients. +* @return none. +* +* \par Description: +* pCoeffs points to the array of filter coefficients stored in time reversed order: +*
    
+*    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+* 
+* The initial filter coefficients serve as a starting point for the adaptive filter. +* pState points to the array of state variables and size of array is +* numTaps+blockSize-1 samples, where blockSize is the number of +* input samples processed by each call to arm_lms_q15(). +*/ + +void arm_lms_init_q15( + arm_lms_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint32_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Assign postShift value to be applied */ + S->postShift = postShift; + +} + +/** + * @} end of LMS group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q31.c new file mode 100644 index 000000000..58edb659b --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_q31.c @@ -0,0 +1,100 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_init_q31.c +* +* Description: Q31 LMS filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Initialization function for Q31 LMS filter. + * @param[in] *S points to an instance of the Q31 LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + * @return none. + * + * \par Description: + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, + * where blockSize is the number of input samples processed by each call to + * arm_lms_q31(). + */ + +void arm_lms_init_q31( + arm_lms_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint32_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, ((uint32_t) numTaps + (blockSize - 1u)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Assign postShift value to be applied */ + S->postShift = postShift; + +} + +/** + * @} end of LMS group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c new file mode 100644 index 000000000..b2ac45206 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c @@ -0,0 +1,456 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_norm_f32.c +* +* Description: Processing function for the floating-point Normalised LMS. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup LMS_NORM Normalized LMS Filters + * + * This set of functions implements a commonly used adaptive filter. + * It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization + * factor which increases the adaptation rate of the filter. + * The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types. + * + * A normalized least mean square (NLMS) filter consists of two components as shown below. + * The first component is a standard transversal or FIR filter. + * The second component is a coefficient update mechanism. + * The NLMS filter has two input signals. + * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter. + * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input. + * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input. + * This "error signal" tends towards zero as the filter adapts. + * The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal. + * \image html LMS.gif "Internal structure of the NLMS adaptive filter" + * + * The functions operate on blocks of data and each call to the function processes + * blockSize samples through the filter. + * pSrc points to input signal, pRef points to reference signal, + * pOut points to output signal and pErr points to error signal. + * All arrays contain blockSize values. + * + * The functions operate on a block-by-block basis. + * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis. + * The convergence of the LMS filter is slower compared to the normalized LMS algorithm. + * + * \par Algorithm: + * The output signal y[n] is computed by a standard FIR filter: + *
    
+ *     y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]    
+ * 
+ * + * \par + * The error signal equals the difference between the reference signal d[n] and the filter output: + *
    
+ *     e[n] = d[n] - y[n].    
+ * 
+ * + * \par + * After each sample of the error signal is computed the instanteous energy of the filter state variables is calculated: + *
    
+ *    E = x[n]^2 + x[n-1]^2 + ... + x[n-numTaps+1]^2.    
+ * 
+ * The filter coefficients b[k] are then updated on a sample-by-sample basis: + *
    
+ *     b[k] = b[k] + e[n] * (mu/E) * x[n-k],  for k=0, 1, ..., numTaps-1    
+ * 
+ * where mu is the step size and controls the rate of coefficient convergence. + *\par + * In the APIs, pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the order: + * \par + *
    
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}    
+ * 
+ * \par + * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples. + * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters, + * to be avoided and yields a significant speed improvement. + * The state variables are updated after each block of data is processed. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter and + * coefficient and state arrays cannot be shared among instances. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * \par + * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function. + * \par Fixed-Point Behavior: + * Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter. + * The following issues must be considered: + * - Scaling of coefficients + * - Overflow and saturation + * + * \par Scaling of Coefficients: + * Filter coefficients are represented as fractional values and + * coefficients are restricted to lie in the range [-1 +1). + * The fixed-point functions have an additional scaling parameter postShift. + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * This essentially scales the filter coefficients by 2^postShift and + * allows the filter coefficients to exceed the range [+1 -1). + * The value of postShift is set by the user based on the expected gain through the system being modeled. + * + * \par Overflow and Saturation: + * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are + * described separately as part of the function specific documentation below. + */ + + +/** + * @addtogroup LMS_NORM + * @{ + */ + + + /** + * @brief Processing function for floating-point normalized LMS filter. + * @param[in] *S points to an instance of the floating-point normalized LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_lms_norm_f32( + arm_lms_norm_instance_f32 * S, + float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + float32_t energy; /* Energy of the input */ + float32_t sum, e, d; /* accumulator, error, reference data sample */ + float32_t w, x0, in; /* weight factor, temporary variable to hold input sample and state */ + + /* Initializations of error, difference, Coefficient update */ + e = 0.0f; + d = 0.0f; + w = 0.0f; + + energy = S->energy; + x0 = S->x0; + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= x0 * x0; + energy += in * in; + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result in the accumulator, store in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Calculation of Weighting factor for updating filter coefficients */ + /* epsilon value 0.000000119209289f */ + w = (e * mu) / (energy + 0.000000119209289f); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + *pb += w * (*px++); + pb++; + + *pb += w * (*px++); + pb++; + + *pb += w * (*px++); + pb++; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + S->energy = energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1u)/4 samples copy */ + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= x0 * x0; + energy += in * in; + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result in the accumulator is stored in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Calculation of Weighting factor for updating filter coefficients */ + /* epsilon value 0.000000119209289f */ + w = (e * mu) / (energy + 0.000000119209289f); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCcoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + S->energy = energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1u) samples */ + tapCnt = (numTaps - 1u); + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c new file mode 100644 index 000000000..3d31cfb25 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c @@ -0,0 +1,100 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_norm_init_f32.c +* +* Description: Floating-point NLMS filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS_NORM + * @{ + */ + + /** + * @brief Initialization function for floating-point normalized LMS filter. + * @param[in] *S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par Description: + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, + * where blockSize is the number of input samples processed by each call to arm_lms_norm_f32(). + */ + +void arm_lms_norm_init_f32( + arm_lms_norm_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Initialise Energy to zero */ + S->energy = 0.0f; + + /* Initialise x0 to zero */ + S->x0 = 0.0f; + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q15.c new file mode 100644 index 000000000..a1cf1b001 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q15.c @@ -0,0 +1,107 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_norm_init_q15.c +* +* Description: Q15 NLMS initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" +#include "arm_common_tables.h" + +/** + * @addtogroup LMS_NORM + * @{ + */ + + /** + * @brief Initialization function for Q15 normalized LMS filter. + * @param[in] *S points to an instance of the Q15 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to the array of state variables and size of array is + * numTaps+blockSize-1 samples, where blockSize is the number of input samples processed + * by each call to arm_lms_norm_q15(). + */ + +void arm_lms_norm_init_q15( + arm_lms_norm_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint8_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t)); + + /* Assign post Shift value applied to coefficients */ + S->postShift = postShift; + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Initialize reciprocal pointer table */ + S->recipTable = (q15_t *) armRecipTableQ15; + + /* Initialise Energy to zero */ + S->energy = 0; + + /* Initialise x0 to zero */ + S->x0 = 0; + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q31.c new file mode 100644 index 000000000..a2fae7b38 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_q31.c @@ -0,0 +1,106 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_norm_init_q31.c +* +* Description: Q31 NLMS initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" +#include "arm_common_tables.h" + +/** + * @addtogroup LMS_NORM + * @{ + */ + + /** + * @brief Initialization function for Q31 normalized LMS filter. + * @param[in] *S points to an instance of the Q31 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, + * where blockSize is the number of input samples processed by each call to arm_lms_norm_q31(). + */ + +void arm_lms_norm_init_q31( + arm_lms_norm_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint8_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q31_t)); + + /* Assign post Shift value applied to coefficients */ + S->postShift = postShift; + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Initialize reciprocal pointer table */ + S->recipTable = (q31_t *) armRecipTableQ31; + + /* Initialise Energy to zero */ + S->energy = 0; + + /* Initialise x0 to zero */ + S->x0 = 0; + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c new file mode 100644 index 000000000..a1229a203 --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c @@ -0,0 +1,435 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_norm_q15.c +* +* Description: Q15 NLMS filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS_NORM + * @{ + */ + +/** +* @brief Processing function for Q15 normalized LMS filter. +* @param[in] *S points to an instance of the Q15 normalized LMS filter structure. +* @param[in] *pSrc points to the block of input data. +* @param[in] *pRef points to the block of reference data. +* @param[out] *pOut points to the block of output data. +* @param[out] *pErr points to the block of error data. +* @param[in] blockSize number of samples to process. +* @return none. +* +* Scaling and Overflow Behavior: +* \par +* The function is implemented using a 64-bit internal accumulator. +* Both coefficients and state variables are represented in 1.15 format and +* multiplications yield a 2.30 result. The 2.30 intermediate results are +* accumulated in a 64-bit accumulator in 34.30 format. +* There is no risk of internal overflow with this approach and the full +* precision of intermediate multiplications is preserved. After all additions +* have been performed, the accumulator is truncated to 34.15 format by +* discarding low 15 bits. Lastly, the accumulator is saturated to yield a +* result in 1.15 format. +* +* \par +* In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted. +* + */ + +void arm_lms_norm_q15( + arm_lms_norm_instance_q15 * S, + q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + q15_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q31_t energy; /* Energy of the input */ + q63_t acc; /* Accumulator */ + q15_t e = 0, d = 0; /* error, reference data sample */ + q15_t w = 0, in; /* weight factor and state */ + q15_t x0; /* temporary variable to hold input sample */ + //uint32_t shift = (uint32_t) S->postShift + 1u; /* Shift to be applied to the output */ + q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */ + q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */ + q31_t coef; /* Teporary variable for coefficient */ + q31_t acc_l, acc_h; + int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */ + int32_t uShift = (32 - lShift); + + energy = S->energy; + x0 = S->x0; + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= (((q31_t) x0 * (x0)) >> 15); + energy += (((q31_t) in * (in)) >> 15); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + + /* Perform the multiply-accumulate */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + +#else + + acc += (((q31_t) * px++ * (*pb++))); + acc += (((q31_t) * px++ * (*pb++))); + acc += (((q31_t) * px++ * (*pb++))); + acc += (((q31_t) * px++ * (*pb++))); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += (((q31_t) * px++ * (*pb++))); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16u); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q15_t) acc; + *pErr++ = e; + + /* Calculation of 1/energy */ + postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, + &oneByEnergy, S->recipTable); + + /* Calculation of e * mu value */ + errorXmu = (q15_t) (((q31_t) e * mu) >> 15); + + /* Calculation of (e * mu) * (1/energy) value */ + acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift)); + + /* Weighting factor for the normalized version */ + w = (q15_t) __SSAT((q31_t) acc, 16); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while(tapCnt > 0u) + { + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1u; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q15_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + +#else + + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + +#endif + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= (((q31_t) x0 * (x0)) >> 15); + energy += (((q31_t) in * (in)) >> 15); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += (((q31_t) * px++ * (*pb++))); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16u); + + /* Converting the result to 1.15 format */ + //acc = __SSAT((acc >> (16u - shift)), 16u); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q15_t) acc; + *pErr++ = e; + + /* Calculation of 1/energy */ + postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, + &oneByEnergy, S->recipTable); + + /* Calculation of e * mu value */ + errorXmu = (q15_t) (((q31_t) e * mu) >> 15); + + /* Calculation of (e * mu) * (1/energy) value */ + acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift)); + + /* Weighting factor for the normalized version */ + w = (q15_t) __SSAT((q31_t) acc, 16); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1u; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q15_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* copy (numTaps - 1u) data */ + tapCnt = (numTaps - 1u); + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + + +/** + * @} end of LMS_NORM group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c new file mode 100644 index 000000000..791a8637c --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c @@ -0,0 +1,426 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_norm_q31.c +* +* Description: Processing function for the Q31 NLMS filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS_NORM + * @{ + */ + +/** +* @brief Processing function for Q31 normalized LMS filter. +* @param[in] *S points to an instance of the Q31 normalized LMS filter structure. +* @param[in] *pSrc points to the block of input data. +* @param[in] *pRef points to the block of reference data. +* @param[out] *pOut points to the block of output data. +* @param[out] *pErr points to the block of error data. +* @param[in] blockSize number of samples to process. +* @return none. +* +* Scaling and Overflow Behavior: +* \par +* The function is implemented using an internal 64-bit accumulator. +* The accumulator has a 2.62 format and maintains full precision of the intermediate +* multiplication results but provides only a single guard bit. +* Thus, if the accumulator result overflows it wraps around rather than clip. +* In order to avoid overflows completely the input signal must be scaled down by +* log2(numTaps) bits. The reference signal should not be scaled down. +* After all multiply-accumulates are performed, the 2.62 accumulator is shifted +* and saturated to 1.31 format to yield the final result. +* The output signal and error signal are in 1.31 format. +* +* \par +* In this filter, filter coefficients are updated for each sample and the +* updation of filter cofficients are saturted. +* +*/ + +void arm_lms_norm_q31( + arm_lms_norm_instance_q31 * S, + q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + q31_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q63_t energy; /* Energy of the input */ + q63_t acc; /* Accumulator */ + q31_t e = 0, d = 0; /* error, reference data sample */ + q31_t w = 0, in; /* weight factor and state */ + q31_t x0; /* temporary variable to hold input sample */ +// uint32_t shift = 32u - ((uint32_t) S->postShift + 1u); /* Shift to be applied to the output */ + q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */ + q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */ + q31_t coef; /* Temporary variable for coef */ + q31_t acc_l, acc_h; /* temporary input */ + uint32_t uShift = ((uint32_t) S->postShift + 1u); + uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */ + + energy = S->energy; + x0 = S->x0; + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while(blkCnt > 0u) + { + + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy = (q31_t) ((((q63_t) energy << 32) - + (((q63_t) x0 * x0) << 1)) >> 32); + energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + acc += ((q63_t) (*px++)) * (*pb++); + acc += ((q63_t) (*px++)) * (*pb++); + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q31_t) acc; + *pErr++ = e; + + /* Calculates the reciprocal of energy */ + postShift = arm_recip_q31(energy + DELTA_Q31, + &oneByEnergy, &S->recipTable[0]); + + /* Calculation of product of (e * mu) */ + errorXmu = (q31_t) (((q63_t) e * mu) >> 31); + + /* Weighting factor for the normalized version */ + w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift)); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + + /* coef is in 2.30 format */ + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + /* get coef in 1.31 format by left shifting */ + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + /* update coefficient buffer to next coefficient */ + pb++; + + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q31_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1u) samples copy */ + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while(blkCnt > 0u) + { + + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy = + (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32); + energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Converting the result to 1.31 format */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + + //acc = (q31_t) (acc >> shift); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q31_t) acc; + *pErr++ = e; + + /* Calculates the reciprocal of energy */ + postShift = + arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]); + + /* Calculation of product of (e * mu) */ + errorXmu = (q31_t) (((q63_t) e * mu) >> 31); + + /* Weighting factor for the normalized version */ + w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift)); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + /* coef is in 2.30 format */ + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + /* get coef in 1.31 format by left shifting */ + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + /* update coefficient buffer to next coefficient */ + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q31_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + start of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop for (numTaps - 1u) samples copy */ + tapCnt = (numTaps - 1u); + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c new file mode 100644 index 000000000..91237c12d --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c @@ -0,0 +1,374 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_q15.c +* +* Description: Processing function for the Q15 LMS filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Processing function for Q15 LMS filter. + * @param[in] *S points to an instance of the Q15 LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par Scaling and Overflow Behavior: + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted. + * + */ + +void arm_lms_q15( + const arm_lms_instance_q15 * S, + q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t mu = S->mu; /* Adaptive factor */ + q15_t *px; /* Temporary pointer for state */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q63_t acc; /* Accumulator */ + q15_t e = 0; /* error of data sample */ + q15_t alpha; /* Intermediate constant for taps update */ + q31_t acc_l, acc_h; + int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */ + int32_t uShift = (32 - lShift); + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t coef; /* Teporary variable for coefficient */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initializing blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2u; + + while(tapCnt > 0u) + { + /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + /* Perform the multiply-accumulate */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + +#else + + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q15_t) acc; + + *pErr++ = (q15_t) e; + + /* Compute alpha i.e. intermediate constant for taps update */ + alpha = (q15_t) (((q31_t) e * (mu)) >> 15); + + /* Initialize state pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2u; + + /* Update filter coefficients */ + while(tapCnt > 0u) + { + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; +#else + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; +#endif + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += (q63_t) ((q31_t) (*px++) * (*pb++)); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q15_t) acc; + + *pErr++ = (q15_t) e; + + /* Compute alpha i.e. intermediate constant for taps update */ + alpha = (q15_t) (((q31_t) e * (mu)) >> 15); + + /* Initialize pState pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + *pb++ += (q15_t) (((q31_t) alpha * (*px++)) >> 15); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + start of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1u) samples */ + tapCnt = (numTaps - 1u); + + /* Copy the data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of LMS group + */ diff --git a/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c new file mode 100644 index 000000000..c43d55d1d --- /dev/null +++ b/src/modules/mathlib/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c @@ -0,0 +1,364 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010 ARM Limited. All rights reserved. +* +* $Date: 15. February 2012 +* $Revision: V1.1.0 +* +* Project: CMSIS DSP Library +* Title: arm_lms_q31.c +* +* Description: Processing function for the Q31 LMS filter. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Version 1.1.0 2012/02/15 +* Updated with more optimizations, bug fixes and minor API changes. +* +* Version 1.0.10 2011/7/15 +* Big Endian support added and Merged M0 and M3/M4 Source code. +* +* Version 1.0.3 2010/11/29 +* Re-organized the CMSIS folders and updated documentation. +* +* Version 1.0.2 2010/11/11 +* Documentation updated. +* +* Version 1.0.1 2010/10/05 +* Production release and review comments incorporated. +* +* Version 1.0.0 2010/09/20 +* Production release and review comments incorporated +* +* Version 0.0.7 2010/06/10 +* Misra-C changes done +* -------------------------------------------------------------------- */ + +#include "arm_math.h" +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Processing function for Q31 LMS filter. + * @param[in] *S points to an instance of the Q15 LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par Scaling and Overflow Behavior: + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate + * multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clips. + * In order to avoid overflows completely the input signal must be scaled down by + * log2(numTaps) bits. + * The reference signal should not be scaled down. + * After all multiply-accumulates are performed, the 2.62 accumulator is shifted + * and saturated to 1.31 format to yield the final result. + * The output signal and error signal are in 1.31 format. + * + * \par + * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted. + */ + +void arm_lms_q31( + const arm_lms_instance_q31 * S, + q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t mu = S->mu; /* Adaptive factor */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q63_t acc; /* Accumulator */ + q31_t e = 0; /* error of data sample */ + q31_t alpha; /* Intermediate constant for taps update */ + q31_t coef; /* Temporary variable for coef */ + q31_t acc_l, acc_h; /* temporary input */ + uint32_t uShift = ((uint32_t) S->postShift + 1u); + uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initializing blkCnt with blockSize */ + blkCnt = blockSize; + + +#ifndef ARM_MATH_CM0 + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + /* acc += b[N] * x[n-N] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* acc += b[N-1] * x[n-N-1] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* acc += b[N-2] * x[n-N-2] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* acc += b[N-3] * x[n-N-3] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q31_t) acc; + + *pErr++ = (q31_t) e; + + /* Compute alpha i.e. intermediate constant for taps update */ + alpha = (q31_t) (((q63_t) e * mu) >> 31); + + /* Initialize state pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while(tapCnt > 0u) + { + /* coef is in 2.30 format */ + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + /* get coef in 1.31 format by left shifting */ + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + /* update coefficient buffer to next coefficient */ + pb++; + + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1u) samples copy */ + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while(blkCnt > 0u) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Store the result from accumulator into the destination buffer. */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q31_t) acc; + + *pErr++ = (q31_t) e; + + /* Weighting factor for the LMS version */ + alpha = (q31_t) (((q63_t) e * mu) >> 31); + + /* Initialize pState pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while(tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb += (coef << 1u); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + start of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1u) samples */ + tapCnt = (numTaps - 1u); + + /* Copy the data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #ifndef ARM_MATH_CM0 */ + +} + +/** + * @} end of LMS group + */ -- cgit v1.2.3