1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "intrinsic_utils.h"
20 
21 #include <array>
22 #include <cmath>
23 #include <functional>
24 #include <utility>
25 #include <vector>
26 
27 #include <assert.h>
28 
29 // We conditionally include neon optimizations for ARM devices
30 #pragma push_macro("USE_NEON")
31 #undef USE_NEON
32 
33 #if defined(__ARM_NEON__) || defined(__aarch64__)
34 #include <arm_neon.h>
35 #define USE_NEON
36 #endif
37 
38 // Use dither to prevent subnormals for CPUs that raise an exception.
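// The dither value used below is std::numeric_limits<float>::min() * (1 << 24), about 2e-31,
// far below the audible range; its sign is flipped on every kernel iteration so it does not
// accumulate in the filter state.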
39 #pragma push_macro("USE_DITHER")
40 #undef USE_DITHER
41 
#if defined(__i386__) || defined(__x86_64__)
43 #define USE_DITHER
44 #endif
45 
46 namespace android::audio_utils {
47 
48 static constexpr size_t kBiquadNumCoefs  = 5;
49 static constexpr size_t kBiquadNumDelays = 2;
50 
51 /**
52  * The BiquadDirect2Transpose is a low overhead
53  * Biquad filter with coefficients b0, b1, b2, a1, a2.
54  *
55  * This can be used by itself, but it is preferred for best data management
56  * to use the BiquadFilter abstraction below.
57  *
58  * T is the data type (scalar or vector).
59  * F is the filter coefficient type.  It is either a scalar or vector (matching T).
60  */
61 template <typename T, typename F>
62 struct BiquadDirect2Transpose {
63     F coef_[5]; // these are stored with the denominator a's negated.
64     T s_[2]; // delay states
65 
66     // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
67     // as expressed by a bitmask.
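    // Bit i of the mask corresponds to coefficient i in the order b0, b1, b2, a1, a2
    // (b0 is the least significant bit).  For example, 0xb (binary 01011) means
    // b0, b1, and a1 are nonzero: a first order IIR.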
68     static inline constexpr size_t required_occupancies_[] = {
69         0x1,  // constant scale
70         0x3,  // single zero
71         0x7,  // double zero
72         0x9,  // single pole
73         0xb,  // (11) first order IIR
74         0x1b, // (27) double pole + single zero
75         0x1f, // (31) second order IIR (full Biquad)
76     };
77 
    // Note the argument order: the b's come first, then the a's.
    // The a's are passed in positive (unnegated) form; some filter conventions use the negated form.
80     BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
81             const T& s0 = {}, const T& s1 = {})
82         // : coef_{b0, b1, b2, -a1, -a2}
83         : coef_{ b0,
84             b1,
85             b2,
86             intrinsics::vneg(a1),
87             intrinsics::vneg(a2) }
88         , s_{ s0, s1 } {
89     }
90 
    // D is the data type.  It must be the same as the element type of T and F.
    // Note the argument order: output comes before input.
93     template<typename D, size_t OCCUPANCY = 0x1f>
94     __attribute__((always_inline)) // required for 1ch speedup (30% faster)
    void process(D* output, const D* input, size_t frames, size_t stride) {
96         using namespace intrinsics;
97         // For SSE it is possible to vdup F to T if F is scalar.
98         const F b0 = coef_[0];         // b0
99         const F b1 = coef_[1];         // b1
100         const F b2 = coef_[2];         // b2
101         const F negativeA1 = coef_[3]; // -a1
102         const F negativeA2 = coef_[4]; // -a2
103         T s[2] = { s_[0], s_[1] };
104         T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
105 #ifdef USE_DITHER
106         constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
107         T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
108 #endif
109 
110         // Unroll control.  Make sure the constexpr remains constexpr :-).
111         constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
112         constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2;   // below this won't be unrolled.
113         constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
114         constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
115                 CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
116         size_t remainder = 0;
117         if constexpr (UNROLL_LOOPS > 1) {
118             remainder = frames % UNROLL_LOOPS;
119             frames /= UNROLL_LOOPS;
120         }
121 
122         // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
123         // The other alternative is to use a MACRO, but that doesn't read as well.
124         const auto KERNEL = [&]() __attribute__((always_inline)) {
125             xn = vld1<T>(input);
126             input += stride;
127 #ifdef USE_DITHER
128             xn = vadd(xn, dither);
129             dither = vneg(dither);
130 #endif
131 
132             yn = s[0];
133             if constexpr (OCCUPANCY >> 0 & 1) {
134                 yn = vmla(yn, b0, xn);
135             }
136             vst1(output, yn);
137             output += stride;
138 
139             s[0] = s[1];
140             if constexpr (OCCUPANCY >> 3 & 1) {
141                 s[0] = vmla(s[0], negativeA1, yn);
142             }
143             if constexpr (OCCUPANCY >> 1 & 1) {
144                 s[0] = vmla(s[0], b1, xn);
145             }
146             if constexpr (OCCUPANCY >> 2 & 1) {
147                 s[1] = vmul(b2, xn);
148             } else {
149                 s[1] = vdupn<T>(0.f);
150             }
151             if constexpr (OCCUPANCY >> 4 & 1) {
152                 s[1] = vmla(s[1], negativeA2, yn);
153             }
154         };
155 
156         while (frames > 0) {
157             #pragma unroll
158             for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
159                 KERNEL();
160             }
161             frames--;
162         }
163         if constexpr (UNROLL_LOOPS > 1) {
164             for (size_t i = 0; i < remainder; ++i) {
165                 KERNEL();
166             }
167         }
168         s_[0] = s[0];
169         s_[1] = s[1];
170     }
171 };
172 
173 /**
174  * A state space formulation of filtering converts a n-th order difference equation update
175  * to a first order vector difference equation. For the Biquad filter, the state space form
176  * has improved numerical precision properties with poles near the unit circle as well as
177  * increased speed due to better parallelization of the state update [1][2].
178  *
 * [1] Raph Levien: (observable canonical form)
180  * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
181  *
182  * [2] Julius O Smith III: (controllable canonical form)
183  * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html
184  *
185  * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d.
186  *
187  *
188  *        +------[ d ]--------------------------+
189  *        |                         S           |
190  *  x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y
191  *                   |              |
192  *                   +----[ A ]-----+
193  *
 * The 2nd order Biquad IIR coefficients are as follows in observable canonical form:
195  *
196  * y = [C_1 C_2] | S_1 | + d x
197  *               | S_2 |
198  *
199  *
200  * | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x
201  * | S_2 |   | A_21 A_22 | | S_2 |   | B_2 |
202  *
203  *
204  * A_11 = -a1
205  * A_12 = 1
206  * A_21 = -a2
207  * A_22 = 0
208  *
209  * B_1 = b1 - b0 * a1
210  * B_2 = b2 - b0 * a2
211  *
212  * C_1 = 1
213  * C_2 = 0
214  *
215  * d = b0
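 *
 * For example (illustrative values only): with b0 = 0.5, b1 = 0.2, b2 = 0.1, a1 = -0.6,
 * a2 = 0.3, the state space coefficients are B_1 = 0.2 - 0.5 * (-0.6) = 0.5,
 * B_2 = 0.1 - 0.5 * 0.3 = -0.05, A_11 = 0.6, A_21 = -0.3, and d = 0.5.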
216  *
217  * Implementation details: The state space filter is typically expressed in either observable or
218  * controllable canonical form [3].  We follow the observable form here.
219  * Raph [4] discovered that the single channel Biquad implementation can be further optimized
220  * by doing 2 filter updates at once (improving speed on NEON by about 20%).
 * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiplies / sample,
222  * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput.
223  *
224  * [3] Mathworks
225  * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html
226  * [4] Raph Levien
227  * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
228  *
229  * The template variables
230  * T is the data type (scalar or vector).
231  * F is the filter coefficient type.  It is either a scalar or vector (matching T).
232  */
233 template <typename T, typename F, bool SEPARATE_CHANNEL_OPTIMIZATION = false>
234 struct BiquadStateSpace {
235     F coef_[5]; // these are stored as state-space converted.
236     T s_[2]; // delay states
237 
238     // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
239     // as expressed by a bitmask.  This must include 31.
240     static inline constexpr size_t required_occupancies_[] = {
241         1,  // constant scale
242         3,  // single zero
243         7,  // double zero
244         9,  // single pole
245         11, // first order IIR
246         27, // double pole + single zero
247         31, // second order IIR (full Biquad)
248     };
249 
    // Note the argument order: the b's come first, then the a's.
    // The a's are passed in positive (unnegated) form; some filter conventions use the negated form.
252     BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
253             const T& s0 = {}, const T& s1 = {})
254         // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2}
255         : coef_{ b0,
256             intrinsics::vsub(b1, intrinsics::vmul(b0, a1)),
257             intrinsics::vsub(b2, intrinsics::vmul(b0, a2)),
258             intrinsics::vneg(a1),
259             intrinsics::vneg(a2) }
260         , s_{s0, s1} {
261     }
262 
    // D is the data type.  It must be the same as the element type of T and F.
    // Note the argument order: output comes before input.
265     template<typename D, size_t OCCUPANCY = 0x1f>
    void process(D* output, const D* input, size_t frames, size_t stride) {
267         using namespace intrinsics;
268         const F b0 = coef_[0];         // b0
269         const F b1ss = coef_[1];       // b1 - b0 * a1,
270         const F b2ss = coef_[2];       // b2 - b0 * a2,
271         const F negativeA1 = coef_[3]; // -a1
272         const F negativeA2 = coef_[4]; // -a2
273         T s[2] = { s_[0], s_[1] };
274         T x, new_s0; // OK to declare temps here rather than at the point of initialization.
275 #ifdef USE_DITHER
276         constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
277         T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
278 #endif
279         constexpr bool b0_present = (OCCUPANCY & 0x1) != 0;
280         constexpr bool a1_present = (OCCUPANCY & 0x8) != 0;
281         constexpr bool a2_present = (OCCUPANCY & 0x10) != 0;
282         constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 ||
283                 (b0_present && a1_present);
284         constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 ||
285                 (b0_present && a2_present);
286 
287         // Unroll control.  Make sure the constexpr remains constexpr :-).
288         constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
289         constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1;   // below this won't be unrolled.
290         constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
291         constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
292                 CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
293 
294         if constexpr (SEPARATE_CHANNEL_OPTIMIZATION && CHANNELS == 1 && OCCUPANCY >= 11) {
295             // Special acceleration which computes 2 samples at a time.
296             // see reference [4] for computation of this matrix.
297             intrinsics::internal_array_t<T, 4> A[4] = {
298                 {b0, b1ss, negativeA1 * b1ss + b2ss, negativeA2 * b1ss},
299                 {0, b0, b1ss, b2ss},
300                 {1, negativeA1, negativeA2 + negativeA1 * negativeA1, negativeA1 * negativeA2},
301                 {0, 1, negativeA1, negativeA2},
302                 };
303             intrinsics::internal_array_t<T, 4> y;
304             while (frames > 1) {
305                 x = vld1<T>(input);
306                 input += stride;
307 #ifdef USE_DITHER
308                 x = vadd(x, dither);
309                 dither = vneg(dither);
310 #endif
311                 y = vmul(A[0], x);
312 
313                 x = vld1<T>(input);
314                 input += stride;
315 #ifdef USE_DITHER
316                 x = vadd(x, dither);
317                 dither = vneg(dither);
318 #endif
319                 y = vmla(y, A[1], x);
320                 y = vmla(y, A[2], s[0]);
321                 y = vmla(y, A[3], s[1]);
322 
323                 vst1(output, y.v[0]);
324                 output += stride;
325 
326                 vst1(output, y.v[1]);
327                 output += stride;
328 
329                 s[0] = y.v[2];
330                 s[1] = y.v[3];
331                 frames -= 2;
332             }
333             if (frames == 0) {
334                 s_[0] = s[0];
335                 s_[1] = s[1];
336                 return;
337             }
338         }
339 
340         size_t remainder = 0;
341         if constexpr (UNROLL_LOOPS > 1) {
342             remainder = frames % UNROLL_LOOPS;
343             frames /= UNROLL_LOOPS;
344         }
345 
346         // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
347         // The other alternative is to use a MACRO, but that doesn't read as well.
348         const auto KERNEL = [&]() __attribute__((always_inline)) {
349             x = vld1<T>(input);
350             input += stride;
351 #ifdef USE_DITHER
352             x = vadd(x, dither);
353             dither = vneg(dither);
354 #endif
355             // vst1(output, vadd(s[0], vmul(b0, x)));
356             // output += stride;
357             // new_s0 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s[0])), s[1]);
358             // s[1] = vadd(vmul(b2ss, x), vmul(negativeA2, s[0]));
359 
360             if constexpr (b0_present) {
361                 vst1(output, vadd(s[0], vmul(b0, x)));
362             } else /* constexpr */ {
363                 vst1(output, s[0]);
364             }
365             output += stride;
366             new_s0 = s[1];
367             if constexpr (b1ss_present) {
368                 new_s0 = vadd(new_s0, vmul(b1ss, x));
369             }
370             if constexpr (a1_present) {
371                 new_s0 = vadd(new_s0, vmul(negativeA1, s[0]));
372             }
373             if constexpr (b2ss_present) {
374                 s[1] = vmul(b2ss, x);
375                 if constexpr (a2_present) {
376                     s[1] = vadd(s[1], vmul(negativeA2, s[0]));
377                 }
378             } else if constexpr (a2_present) {
379                 s[1] = vmul(negativeA2, s[0]);
380             }
381             s[0] = new_s0;
382         };
383 
384         while (frames > 0) {
385             #pragma unroll
386             for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
387                 KERNEL();
388             }
389             frames--;
390         }
391         if constexpr (UNROLL_LOOPS > 1) {
392             for (size_t i = 0; i < remainder; ++i) {
393                 KERNEL();
394             }
395         }
396         s_[0] = s[0];
397         s_[1] = s[1];
398     }
399 };
400 
401 namespace details {
402 
403 // Helper methods for constructing a constexpr array of function pointers.
404 // As function pointers are efficient and have no constructor/destructor
405 // this is preferred over std::function.
406 //
407 // SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
408 template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
410     using first_t = decltype(&F<0, false>::func);  // type from function
411     using result_t = std::array<first_t, sizeof...(Is)>;   // type of array
412     return result_t{{F<Is, SC>::func...}};      // initialize with functions.
413 }
414 
415 template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
static inline constexpr auto make_functional_array() {
417     return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
418 }
419 
420 // Returns true if the poles are stable for a Biquad.
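// For example, a1 = -1.8, a2 = 0.81 (a double pole at z = 0.9) is stable since
// |a2| < 1 and |a1| < 1 + a2, whereas a1 = 0, a2 = 1 (poles on the unit circle) is not.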
421 template <typename D>
static inline constexpr bool isStable(const D& a1, const D& a2) {
423     return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
424 }
425 
426 // Simplifies Biquad coefficients.
427 // TODO: consider matched pole/zero cancellation.
428 //       consider subnormal elimination for Intel processors.
429 template <typename D, typename T>
std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
    std::array<D, kBiquadNumCoefs> lcoef{};  // value-initialize; entries skipped when offset > 0 must be zero.
432     if (coef.size() == kBiquadNumCoefs + 1) {
433         // General form of Biquad.
434         // Remove matched z^-1 factors in top and bottom (e.g. coefs[0] == coefs[3] == 0).
435         size_t offset = 0;
436         for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
        assert(coef[offset + 3] != 0); // a zero leading denominator coefficient would not be causal.
438 
439         // Normalize 6 coefficients to 5 for storage.
440         lcoef[0] = coef[offset] / coef[offset + 3];
441         for (size_t i = 1; i + offset < 3; ++i) {
442             lcoef[i] = coef[i + offset] / coef[offset + 3];
443             lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
444          }
445     } else if (coef.size() == kBiquadNumCoefs) {
446         std::copy(coef.begin(), coef.end(), lcoef.begin());
447     } else {
448         assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
449     }
450     return lcoef;
451 }
452 
453 // Sets a container of coefficients to storage.
454 template <typename D, typename T, typename DEST>
static inline void setCoefficients(
456         DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
457     auto lcoef = reduceCoefficients<D, T>(coef);
458     // replicate as needed
459     for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
460         for (size_t j = 0; j < channelCount; ++j) {
461             dest[i * stride + offset + j] = lcoef[i];
462         }
463     }
464 }
465 
466 // Helper function to zero channels in the input buffer.
467 // This is used for the degenerate coefficient case which results in all zeroes.
468 template <typename D>
void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
470     if (stride == channelCount) {
        memset(out, 0, sizeof(D) * frames * channelCount);
472     } else {
473         for (size_t i = 0; i < frames; i++) {
            memset(out, 0, sizeof(D) * channelCount);
475             out += stride;
476         }
477     }
478 }
479 
480 template <typename ConstOptions,
481         size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
483         size_t channelCount, F *delays, const F *coefs, size_t localStride) {
484     using namespace android::audio_utils::intrinsics;
485 
486     constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
487     const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
488     using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
489     using KernelType = typename ConstOptions::template FilterType<T, CoefType>;
490 
491     for (size_t i = 0; i < channelCount; i += elements) {
492         T s1 = vld1<T>(&delays[0]);
493         T s2 = vld1<T>(&delays[localStride]);
494 
495         KernelType kernel(
496                 vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
497                 vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
498                 vld1<CoefType>(coefs + coefStride * 4),
499                 s1, s2);
500         if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
501         kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
502         vst1(&delays[0], kernel.s_[0]);
503         vst1(&delays[localStride], kernel.s_[1]);
504         delays += elements;
505     }
506 }
507 
508 // Find the nearest occupancy mask that includes all the desired bits.
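// For example, with the occupancy table {1, 3, 7, 9, 11, 27, 31} used by the filter kernels
// above, a desired occupancy of 5 (b0, b2) maps to 7, and 21 (b0, b2, a2) maps to 31.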
509 template <typename T, size_t N>
static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
511     if (occupancy < 32) {
512         for (auto test : occupancies) {
513             if ((occupancy & test) == occupancy) return test;
514         }
515     }
516     return 31;
517 }
518 
519 enum FILTER_OPTION {
520     FILTER_OPTION_SCALAR_ONLY = (1 << 0),
521 };
522 
523 
524 /**
525  * DefaultBiquadConstOptions holds the default set of options for customizing
526  * the Biquad filter at compile time.
527  *
528  * Consider inheriting from this structure and overriding the options
529  * desired; this is backward compatible to new options added in the future.
530  */
531 struct DefaultBiquadConstOptions {
532 
533     // Sets the Biquad filter type.
534     // Can be one of the already defined BiquadDirect2Transpose or BiquadStateSpace
535     // filter kernels; also can be a user defined filter kernel as well.
536     template <typename T, typename F>
537     using FilterType = BiquadStateSpace<T, F, false /* SEPARATE_CHANNEL_OPTIMIZATION */>;
538 };
539 
540 #define BIQUAD_FILTER_CASE(N, ... /* type */) \
541             case N: { \
542                 using VectorType = __VA_ARGS__; \
543                 biquad_filter_func_impl<ConstOptions, \
544                         nearestOccupancy(OCCUPANCY, \
545                                 ConstOptions::template FilterType<VectorType, D> \
546                                             ::required_occupancies_), \
547                         SAME_COEF_PER_CHANNEL, VectorType>( \
548                         out + offset, in + offset, frames, stride, remaining, \
549                         delays + offset, c, localStride); \
550                 goto exit; \
551             }
552 
553 template <typename ConstOptions, size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
555         size_t channelCount, D *delays, const D *coefs, size_t localStride,
556         FILTER_OPTION filterOptions) {
557     if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
558         zeroChannels(out, frames, stride, channelCount);
559         return;
560     }
561 
562     // Possible alternative intrinsic types for 2, 9, 15 float elements.
563     // using alt_2_t = struct {struct { float a; float b; } s; };
564     // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
565     // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
566 
567 #ifdef USE_NEON
568     // use NEON types to ensure we have the proper intrinsic acceleration.
569     using alt_16_t = float32x4x4_t;
570     using alt_8_t = float32x4x2_t;
571     using alt_4_t = float32x4_t;
572 #else
573     // Use C++ types, no NEON needed.
574     using alt_16_t = intrinsics::internal_array_t<float, 16>;
575     using alt_8_t = intrinsics::internal_array_t<float, 8>;
576     using alt_4_t = intrinsics::internal_array_t<float, 4>;
577 #endif
578 
579     for (size_t offset = 0; offset < channelCount; ) {
580         size_t remaining = channelCount - offset;
581         auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
582         if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
583         if constexpr (std::is_same_v<D, float>) {
584             switch (remaining) {
585             default:
586                 if (remaining >= 16) {
587                     remaining &= ~15;
588                     biquad_filter_func_impl<ConstOptions,
589                             nearestOccupancy(OCCUPANCY,
590                                     ConstOptions::template FilterType<D, D>
591                                                 ::required_occupancies_),
592                             SAME_COEF_PER_CHANNEL, alt_16_t>(
593                             out + offset, in + offset, frames, stride, remaining,
594                             delays + offset, c, localStride);
595                     offset += remaining;
596                     continue;
597                 }
598                 break;  // case 1 handled at bottom.
599             BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
600             BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
601             BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
602             BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
603             BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
604             BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
605             BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
606             BIQUAD_FILTER_CASE(8, alt_8_t)
607             BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
608             BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
609             BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
610             BIQUAD_FILTER_CASE(4, alt_4_t)
611             BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
612             BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
613             // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
614             }
615         } else if constexpr (std::is_same_v<D, double>) {
616 #if defined(__aarch64__)
617             switch (remaining) {
618             default:
619                 if (remaining >= 8) {
620                     remaining &= ~7;
621                     biquad_filter_func_impl<ConstOptions,
622                             nearestOccupancy(OCCUPANCY,
623                                      ConstOptions::template FilterType<D, D>
624                                                  ::required_occupancies_),
625                             SAME_COEF_PER_CHANNEL,
626                             intrinsics::internal_array_t<double, 8>>(
627                             out + offset, in + offset, frames, stride, remaining,
628                             delays + offset, c, localStride);
629                     offset += remaining;
630                     continue;
631                 }
632                 break; // case 1 handled at bottom.
633             BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
634             BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
635             BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
636             BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
637             BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
638             BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
639             };
640 #endif
641         }
642         scalar:
643         // Essentially the code below is scalar, the same as
644         // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
645         // but formulated with NEON intrinsic-like call pattern.
646         biquad_filter_func_impl<ConstOptions,
647                 nearestOccupancy(OCCUPANCY,
648                         ConstOptions::template FilterType<D, D>::required_occupancies_),
649                  SAME_COEF_PER_CHANNEL, D>(
650                 out + offset, in + offset, frames, stride, remaining,
651                 delays + offset, c, localStride);
652         offset += remaining;
653     }
654     exit:;
655 }
656 
657 } // namespace details
658 
659 /**
660  * BiquadFilter
661  *
662  * A multichannel Biquad filter implementation of the following transfer function.
663  *
664  * \f[
665  *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
666  *               { 1   + a_1 z^{-1} + a_2 z^{-2} }
667  * \f]
668  *
669  * <!--
670  *        b_0 + b_1 z^{-1} + b_2 z^{-2}
671  *  H(z)= -----------------------------
672  *        1 + a_1 z^{-1} + a_2 z^{-2}
673  * -->
674  *
675  *  Details:
676  *    1. The transposed direct type 2 implementation allows zeros to be computed
677  *       before poles in the internal state for improved filter precision and
678  *       better time-varying coefficient performance.
679  *    2. We optimize for zero coefficients using a compile-time generated function table.
680  *    3. We optimize for vector operations using column vector operations with stride
681  *       into interleaved audio data.
682  *    4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
683  *       negated form being slightly simpler for optimization (addition is commutative but
684  *       subtraction is not commutative).  This is to permit obtaining the coefficients
685  *       as a const reference.
686  *
687  *       Compatibility issue: Some Biquad libraries store the denominator coefficients
688  *       in negated form.  We explicitly negate before entering into the inner loop.
689  *    5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
690  *       coefficients.  See setCoefficients() below.
691  *
 * If SAME_COEF_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
693  *
694  * The Biquad filter update equation in transposed Direct form 2 is as follows:
695  *
696  * \f{eqnarray*}{
697  * y[n] &=& b0 * x[n] + s1[n - 1] \\
698  * s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
699  * s2[n] &=& b2 * x[n] - a2 * y[n]
700  * \f}
701  *
702  * For the transposed Direct form 2 update equation s1 and s2 represent the delay state
703  * contained in the internal vector mDelays[].  This is stored interleaved by channel.
704  *
 * Use `-ffast-math` to permit associative math optimizations and effective vectorization,
 * as we do not rely on strict C operator precedence and associativity here.
707  * TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
708  *
709  * \param D type variable representing the data type, one of float or double.
710  *         The default is float.
711  * \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
712  *         are shared between channels, or false if the Biquad coefficients
713  *         may differ between channels. The default is true.
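 *
 * Example usage (an illustrative sketch only; the coefficient values are hypothetical):
 *
 *   constexpr size_t channelCount = 2;
 *   const size_t frames = 256;
 *   android::audio_utils::BiquadFilter<float> filter(channelCount,
 *           std::array<float, android::audio_utils::kBiquadNumCoefs>{
 *                   0.2513f, 0.5027f, 0.2513f, -0.1712f, 0.1766f});
 *   std::vector<float> in(frames * channelCount);   // interleaved input samples
 *   std::vector<float> out(frames * channelCount);  // interleaved output samples
 *   filter.process(out.data(), in.data(), frames);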
714  */
715 
716 template <typename D = float,
717         bool SAME_COEF_PER_CHANNEL = true,
718         typename ConstOptions = details::DefaultBiquadConstOptions>
719 class BiquadFilter {
720 public:
721     template <typename T = std::array<D, kBiquadNumCoefs>>
722     explicit BiquadFilter(size_t channelCount,
723             const T& coefs = {}, bool optimized = true)
            : mChannelCount(channelCount)
725             , mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
726             , mDelays(channelCount * kBiquadNumDelays) {
727         setCoefficients(coefs, optimized);
728     }
729 
730     // copy constructors
    BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
732         *this = other;
733     }
734 
    BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
736         *this = std::move(other);
737     }
738 
739     // copy assignment
740     BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
741             const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
742         mChannelCount = other.mChannelCount;
743         mCoefs = other.mCoefs;
        mDelays = other.mDelays;
        mFilterOptions = other.mFilterOptions; // keep the selected options and filter function in sync.
        mFunc = other.mFunc;
        return *this;
746     }
747 
748     BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
749             BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
750         mChannelCount = other.mChannelCount;
751         mCoefs = std::move(other.mCoefs);
        mDelays = std::move(other.mDelays);
        mFilterOptions = other.mFilterOptions;
        mFunc = other.mFunc;
        return *this;
754     }
755 
756     // operator overloads for equality tests
757     bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
758         return mChannelCount == other.mChannelCount
759                 && mCoefs == other.mCoefs
760                 && mDelays == other.mDelays;
761     }
762 
763     bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
764         return !operator==(other);
765     }
766 
767     /**
768      * \brief Sets filter coefficients
769      *
     * \param coefs  reference to the container holding the filter coefficients.
     * \param optimized whether to use a processor optimized function (optional, defaults true).
     * \return true if the BiquadFilter is stable, otherwise false.
773      *
774      * The input coefficients are interpreted in the following manner:
775      *
776      * If size of container is 5 (normalized Biquad):
777      * coefs[0] is b0,
778      * coefs[1] is b1,
779      * coefs[2] is b2,
780      * coefs[3] is a1,
781      * coefs[4] is a2.
782      *
783      * \f[
784      *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
785      *               { 1   + a_1 z^{-1} + a_2 z^{-2} }
786      * \f]
787      * <!--
788      *        b_0 + b_1 z^{-1} + b_2 z^{-2}
789      *  H(z)= -----------------------------
790      *        1 + a_1 z^{-1} + a_2 z^{-2}
791      * -->
792      *
793      * If size of container is 6 (general Biquad):
794      * coefs[0] is b0,
795      * coefs[1] is b1,
796      * coefs[2] is b2,
797      * coefs[3] is a0,
798      * coefs[4] is a1,
799      * coefs[5] is a2.
800      *
801      * \f[
802      *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
803      *               { a_0 + a_1 z^{-1} + a_2 z^{-2} }
804      * \f]
805      * <!--
806      *        b_0 + b_1 z^{-1} + b_2 z^{-2}
807      *  H(z)= -----------------------------
808      *        a_0 + a_1 z^{-1} + a_2 z^{-2}
809      * -->
810      *
811      * The internal representation is a normalized Biquad.
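     *
     * For example (illustrative values): the general form {1, 2, 1, 2, 0.2, 0.3}
     * is normalized by a0 = 2 to the equivalent {0.5, 1, 0.5, 0.1, 0.15}.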
812      */
813     template <typename T = std::array<D, kBiquadNumCoefs>>
814     bool setCoefficients(const T& coefs, bool optimized = true) {
815         if constexpr (SAME_COEF_PER_CHANNEL) {
816             details::setCoefficients<D, T>(
817                     mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
818         } else {
819             if (coefs.size() == mCoefs.size()) {
820                 std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
821             } else {
822                 details::setCoefficients<D, T>(
823                         mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
824             }
825         }
826         setOptimization(optimized);
827         return isStable();
828     }
829 
830     /**
831      * Sets coefficients for one of the filter channels, specified by channelIndex.
832      *
833      * This method is only available if SAME_COEF_PER_CHANNEL is false.
834      *
835      * \param coefs the coefficients to set.
836      * \param channelIndex the particular channel index to set.
837      * \param optimized whether to use optimized function (optional, defaults true).
838      */
839     template <typename T = std::array<D, kBiquadNumCoefs>>
840     bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
841         static_assert(!SAME_COEF_PER_CHANNEL);
842 
843         details::setCoefficients<D, T>(
844                 mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
845         setOptimization(optimized);
846         return isStable();
847     }
848 
849     /**
850      * Returns the coefficients as a const vector reference.
851      *
     * If multichannel and the template variable SAME_COEF_PER_CHANNEL is false,
853      * the coefficients are interleaved by channel.
854      */
    const std::vector<D>& getCoefficients() const {
856         return mCoefs;
857     }
858 
859     /**
860      * Returns true if the filter is stable.
861      *
862      * \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
863      *        asserts if channelIndex >= channel count (zero based index).
864      */
865     bool isStable(size_t channelIndex = 0) const {
866         if constexpr (SAME_COEF_PER_CHANNEL) {
867             return details::isStable(mCoefs[3], mCoefs[4]);
868         } else {
869             assert(channelIndex < mChannelCount);
870             return details::isStable(
871                     mCoefs[3 * mChannelCount + channelIndex],
872                     mCoefs[4 * mChannelCount + channelIndex]);
873         }
874     }
875 
876     /**
877      * Updates the filter function based on processor optimization.
878      *
879      * \param optimized if true, enables Processor based optimization.
880      */
    void setOptimization(bool optimized) {
882         // Determine which coefficients are nonzero as a bit field.
883         size_t category = 0;
884         for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
885             if constexpr (SAME_COEF_PER_CHANNEL) {
886                 category |= (mCoefs[i] != 0) << i;
887             } else {
888                 for (size_t j = 0; j < mChannelCount; ++j) {
889                     if (mCoefs[i * mChannelCount + j] != 0) {
890                         category |= 1 << i;
891                         break;
892                     }
893                 }
894             }
895         }
896 
897         // Select the proper filtering function from our array.
898         if (optimized) {
899             mFilterOptions = (details::FILTER_OPTION)
900                     (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
901         } else {
902              mFilterOptions = (details::FILTER_OPTION)
903                      (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
904         }
905         mFunc = mFilterFuncs[category];
906     }
907 
908     /**
909      * \brief Filters the input data
910      *
911      * \param out     pointer to the output data
912      * \param in      pointer to the input data
913      * \param frames  number of audio frames to be processed
914      */
    void process(D* out, const D* in, size_t frames) {
916         process(out, in, frames, mChannelCount);
917     }
918 
919     /**
920      * \brief Filters the input data with stride
921      *
922      * \param out     pointer to the output data
923      * \param in      pointer to the input data
924      * \param frames  number of audio frames to be processed
925      * \param stride  the total number of samples associated with a frame, if not channelCount.
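     *                For example, to filter 2 channels embedded in an 8 channel
     *                interleaved stream, pass stride = 8.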
926      */
    void process(D* out, const D* in, size_t frames, size_t stride) {
928         assert(stride >= mChannelCount);
929         mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
930                 mCoefs.data(), mChannelCount, mFilterOptions);
931     }
932 
933     /**
934      * EXPERIMENTAL:
     * Processes 1D input data with mChannelCount Biquads, using sliding window parallelism.
     *
     * Instead of considering the mChannelCount Biquads as one-per-input-channel, this method
     * treats them as applied in sequence to a single 1D input stream,
     * with the highest-indexed (last) Biquad applied first.
     *
     * input audio data -> BQ_{n-1} -> BQ_{n-2} -> ... -> BQ_0 -> output
942      *
943      * TODO: Make this code efficient for NEON and split the destination from the source.
944      *
945      * Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
946      * sequentially applied, but in practice it is *MUCH* slower.
947      * On NEON, the data cannot be written then read in-place without incurring
948      * memory stall penalties.  A shifting NEON holding register is required to make this
949      * a practical improvement.
950      */
    void process1D(D* inout, size_t frames) {
952         size_t remaining = mChannelCount;
953 #ifdef USE_NEON
954         // We apply NEON acceleration striped with 4 filters (channels) at once.
        // Filter operations commute; nevertheless we apply the filters in order.
956         if (frames >= 2 * mChannelCount) {
957             constexpr size_t channelBlock = 4;
958             for (; remaining >= channelBlock; remaining -= channelBlock) {
959                 const size_t baseIdx = remaining - channelBlock;
960                 // This is the 1D accelerated method.
961                 // prime the data pipe.
962                 for (size_t i = 0; i < channelBlock - 1; ++i) {
963                     size_t fromEnd = remaining - i - 1;
964                     auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
965                     auto delays = mDelays.data() + fromEnd;
966                     mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
967                             delays, coefs, mChannelCount, mFilterOptions);
968                 }
969 
970                 auto delays = mDelays.data() + baseIdx;
971                 auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
972                 // Parallel processing - we use a sliding window doing channelBlock at once,
973                 // sliding one audio sample at a time.
974                 mFunc(inout, inout,
975                         frames - channelBlock + 1, 1 /* stride */, channelBlock,
976                         delays, coefs, mChannelCount, mFilterOptions);
977 
978                 // drain data pipe.
979                 for (size_t i = 1; i < channelBlock; ++i) {
980                     mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
981                             1 /* frames */, 1 /* stride */, channelBlock - i,
982                             delays, coefs, mChannelCount, mFilterOptions);
983                 }
984             }
985         }
986 #endif
987         // For short data sequences, we use the serial single channel logical equivalent
988         for (; remaining > 0; --remaining) {
989             size_t fromEnd = remaining - 1;
990             auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
991             mFunc(inout, inout,
992                     frames, 1 /* stride */, 1 /* channelCount */,
993                     mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
994         }
995     }
996 
997     /**
998      * \brief Clears the delay elements
999      *
1000      * This function clears the delay elements representing the filter state.
1001      */
    void clear() {
1003         std::fill(mDelays.begin(), mDelays.end(), 0.f);
1004     }
1005 
1006     /**
1007      * \brief Sets the internal delays from a vector
1008      *
1009      * For a multichannel stream, the delays are interleaved by channel:
1010      * delays[2 * i + 0] is s1 of i-th channel,
1011      * delays[2 * i + 1] is s2 of i-th channel,
1012      * where index i runs from 0 to (mChannelCount - 1).
1013      *
1014      * \param delays reference to vector containing delays.
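     *        The contents are moved from this vector, which is left in a valid
     *        but unspecified state.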
1015      */
    void setDelays(std::vector<D>& delays) {
1017         assert(delays.size() == mDelays.size());
1018         mDelays = std::move(delays);
1019     }
1020 
1021     /**
1022      * \brief Gets delay elements as a vector
1023      *
1024      * For a multichannel stream, the delays are interleaved by channel:
1025      * delays[2 * i + 0] is s1 of i-th channel,
1026      * delays[2 * i + 1] is s2 of i-th channel,
1027      * where index i runs from 0 to (mChannelCount - 1).
1028      *
1029      * \return a const vector reference of delays.
1030      */
    const std::vector<D>& getDelays() const {
1032         return mDelays;
1033     }
1034 
1035 private:
1036     /* const */ size_t mChannelCount; // not const because we can assign to it on operator equals.
1037 
1038     /*
1039      * \var D mCoefs
1040      * \brief Stores the filter coefficients
1041      *
1042      * If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
1043      * interleaved by channel.
1044      */
1045     std::vector<D> mCoefs;
1046 
1047     /**
1048      * \var D mDelays
1049      * \brief The delay state.
1050      *
1051      * The delays are stored channel interleaved in the following manner,
1052      * mDelays[2 * i + 0] is s1 of i-th channel
1053      * mDelays[2 * i + 1] is s2 of i-th channel
1054      * index i runs from 0 to (mChannelCount - 1).
1055      */
1056     std::vector<D> mDelays;
1057 
1058     details::FILTER_OPTION mFilterOptions{};
1059 
1060     // Consider making a separate delegation class.
1061     /*
1062      * We store an array of functions based on the occupancy.
1063      *
1064      * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
1065      * b0 b1 b2 a1 a2  (from lsb to msb)
1066      *
1067      *  static inline constexpr std::array<filter_func*, M> mArray = {
1068      *     biquad_filter_func<0>,
1069      *     biquad_filter_func<1>,
1070      *     biquad_filter_func<2>,
1071      *      ...
1072      *     biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
1073      *  };
1074      *
1075      * Every time the coefficients are changed, we select the processing function from
1076      * this table.
1077      */
1078 
1079     // Used to build the functional array.
1080     template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
1081     struct FuncWrap {
        static void func(D* out, const D *in, size_t frames, size_t stride,
1083                 size_t channelCount, D *delays, const D *coef, size_t localStride,
1084                 details::FILTER_OPTION filterOptions) {
1085             constexpr size_t NEAREST_OCCUPANCY =
1086                 details::nearestOccupancy(
1087                         OCCUPANCY, ConstOptions::template FilterType<D, D>
1088                                                ::required_occupancies_);
1089             details::biquad_filter_func<ConstOptions, NEAREST_OCCUPANCY, SC>(
1090                     out, in, frames, stride, channelCount, delays, coef, localStride,
1091                     filterOptions);
1092         }
1093     };
1094 
1095     // Vector optimized array of functions.
1096     static inline constexpr auto mFilterFuncs =
1097             details::make_functional_array<
1098                     FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
1099 
1100     /**
1101      * \var filter_func* mFunc
1102      *
1103      * The current filter function selected for the channel occupancy of the Biquad.
1104      * It will be one of mFilterFuncs.
1105      */
1106     std::decay_t<decltype(mFilterFuncs[0])> mFunc;
1107 };
1108 
1109 } // namespace android::audio_utils
1110 
1111 #pragma pop_macro("USE_DITHER")
1112 #pragma pop_macro("USE_NEON")
1113