/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "intrinsic_utils.h"

#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <functional>
#include <limits>
#include <type_traits>
#include <utility>
#include <vector>

#include <assert.h>

// We conditionally include NEON optimizations for ARM devices.
#pragma push_macro("USE_NEON")
#undef USE_NEON

#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif

// Use dither to prevent subnormals for CPUs that raise an exception.
#pragma push_macro("USE_DITHER")
#undef USE_DITHER

#if defined(__i386__) || defined(__x86_64__)
#define USE_DITHER
#endif

namespace android::audio_utils {

static constexpr size_t kBiquadNumCoefs = 5;
static constexpr size_t kBiquadNumDelays = 2;

/**
 * The BiquadDirect2Transpose is a low-overhead
 * Biquad filter with coefficients b0, b1, b2, a1, a2.
 *
 * This can be used by itself, but for best data management it is preferred
 * to use the BiquadFilter abstraction below.
 *
 * T is the data type (scalar or vector).
 * F is the filter coefficient type. It is either a scalar or vector (matching T).
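 *
 * Per-sample update (transposed direct form 2), with delay states s0 and s1
 * (these are the same update equations listed in the BiquadFilter documentation below):
 *
 *  y[n] = b0 * x[n] + s0
 *  s0   = s1 + b1 * x[n] - a1 * y[n]
 *  s1   = b2 * x[n] - a2 * y[n]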
 */
template <typename T, typename F>
struct BiquadDirect2Transpose {
    F coef_[5]; // these are stored with the denominator a's negated.
    T s_[2];    // delay states

    // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
    // as expressed by a bitmask.
    static inline constexpr size_t required_occupancies_[] = {
        0x1,  // constant scale
        0x3,  // single zero
        0x7,  // double zero
        0x9,  // single pole
        0xb,  // (11) first order IIR
        0x1b, // (27) double pole + single zero
        0x1f, // (31) second order IIR (full Biquad)
    };

    // Note the argument order: the b coefficients come first, then the a's.
    // The a's are passed in "positive" reference form; some filter conventions use the negation.
    BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
            const T& s0 = {}, const T& s1 = {})
        // : coef_{b0, b1, b2, -a1, -a2}
        : coef_{ b0,
                 b1,
                 b2,
                 intrinsics::vneg(a1),
                 intrinsics::vneg(a2) }
        , s_{ s0, s1 } {
    }

    // D is the scalar data type; it must be the element type of both T and F.
    // Note the argument order: output precedes input.
    template <typename D, size_t OCCUPANCY = 0x1f>
    __attribute__((always_inline)) // required for 1ch speedup (30% faster)
    void process(D* output, const D* input, size_t frames, size_t stride) {
        using namespace intrinsics;
        // For SSE it is possible to vdup F to T if F is scalar.
        const F b0 = coef_[0];          // b0
        const F b1 = coef_[1];          // b1
        const F b2 = coef_[2];          // b2
        const F negativeA1 = coef_[3];  // -a1
        const F negativeA2 = coef_[4];  // -a2
        T s[2] = { s_[0], s_[1] };
        T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
#ifdef USE_DITHER
        constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
        T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
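        // Note: the dither sign alternates every sample (see vneg in the kernel below),
        // so no net DC offset is introduced into the filter state.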
#endif

        // Unroll control. Make sure the constexpr remains constexpr :-).
        constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
        constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2;   // below this won't be unrolled.
        constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
        constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
                CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
        size_t remainder = 0;
        if constexpr (UNROLL_LOOPS > 1) {
            remainder = frames % UNROLL_LOOPS;
            frames /= UNROLL_LOOPS;
        }

        // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
        // The other alternative is to use a MACRO, but that doesn't read as well.
        const auto KERNEL = [&]() __attribute__((always_inline)) {
            xn = vld1<T>(input);
            input += stride;
#ifdef USE_DITHER
            xn = vadd(xn, dither);
            dither = vneg(dither);
#endif

            yn = s[0];
            if constexpr (OCCUPANCY >> 0 & 1) {
                yn = vmla(yn, b0, xn);
            }
            vst1(output, yn);
            output += stride;

            s[0] = s[1];
            if constexpr (OCCUPANCY >> 3 & 1) {
                s[0] = vmla(s[0], negativeA1, yn);
            }
            if constexpr (OCCUPANCY >> 1 & 1) {
                s[0] = vmla(s[0], b1, xn);
            }
            if constexpr (OCCUPANCY >> 2 & 1) {
                s[1] = vmul(b2, xn);
            } else {
                s[1] = vdupn<T>(0.f);
            }
            if constexpr (OCCUPANCY >> 4 & 1) {
                s[1] = vmla(s[1], negativeA2, yn);
            }
        };

        while (frames > 0) {
            #pragma unroll
            for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
                KERNEL();
            }
            frames--;
        }
        if constexpr (UNROLL_LOOPS > 1) {
            for (size_t i = 0; i < remainder; ++i) {
                KERNEL();
            }
        }
        s_[0] = s[0];
        s_[1] = s[1];
    }
};

/**
 * A state space formulation of filtering converts an n-th order difference equation update
 * to a first order vector difference equation. For the Biquad filter, the state space form
 * has improved numerical precision properties with poles near the unit circle as well as
 * increased speed due to better parallelization of the state update [1][2].
 *
 * [1] Raph Levien: (observable canonical form)
 * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
 *
 * [2] Julius O. Smith III: (controllable canonical form)
 * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html
 *
 * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d.
 *
 *
 *         +------[ d ]--------------------------+
 *         |                          S          |
 *  x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y
 *                   |              |
 *                   +----[ A ]-----+
 *
 * The 2nd order Biquad IIR coefficients are as follows in observable canonical form:
 *
 *  y = [C_1 C_2] | S_1 | + d x
 *                | S_2 |
 *
 *
 *  | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x
 *  | S_2 |   | A_21 A_22 | | S_2 |   | B_2 |
 *
 *
 *  A_11 = -a1
 *  A_12 = 1
 *  A_21 = -a2
 *  A_22 = 0
 *
 *  B_1 = b1 - b0 * a1
 *  B_2 = b2 - b0 * a2
 *
 *  C_1 = 1
 *  C_2 = 0
 *
 *  d = b0
 *
 * Implementation details: The state space filter is typically expressed in either observable or
 * controllable canonical form [3]. We follow the observable form here.
 * Raph [4] discovered that the single channel Biquad implementation can be further optimized
 * by doing 2 filter updates at once (improving speed on NEON by about 20%).
 * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiplies / sample,
 * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput.
 *
 * [3] Mathworks
 * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html
 * [4] Raph Levien
 * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
 *
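 * In code form, the per-sample state space update computed by the kernel below is
 * (s0' and s1' denote the updated delay states):
 *
 *  y[n] = b0 * x[n] + s0
 *  s0'  = (b1 - b0 * a1) * x[n] - a1 * s0 + s1
 *  s1'  = (b2 - b0 * a2) * x[n] - a2 * s0
 *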
 * The template variables
 * T is the data type (scalar or vector).
 * F is the filter coefficient type. It is either a scalar or vector (matching T).
 */
template <typename T, typename F, bool SEPARATE_CHANNEL_OPTIMIZATION = false>
struct BiquadStateSpace {
    F coef_[5]; // these are stored as state-space converted.
    T s_[2];    // delay states

    // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
    // as expressed by a bitmask. This must include 31.
    static inline constexpr size_t required_occupancies_[] = {
        1,  // constant scale
        3,  // single zero
        7,  // double zero
        9,  // single pole
        11, // first order IIR
        27, // double pole + single zero
        31, // second order IIR (full Biquad)
    };

    // Note the argument order: the b coefficients come first, then the a's.
    // The a's are passed in "positive" reference form; some filter conventions use the negation.
    BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
            const T& s0 = {}, const T& s1 = {})
        // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2}
        : coef_{ b0,
                 intrinsics::vsub(b1, intrinsics::vmul(b0, a1)),
                 intrinsics::vsub(b2, intrinsics::vmul(b0, a2)),
                 intrinsics::vneg(a1),
                 intrinsics::vneg(a2) }
        , s_{ s0, s1 } {
    }

    // D is the scalar data type; it must be the element type of both T and F.
    // Note the argument order: output precedes input.
    template <typename D, size_t OCCUPANCY = 0x1f>
    void process(D* output, const D* input, size_t frames, size_t stride) {
        using namespace intrinsics;
        const F b0 = coef_[0];          // b0
        const F b1ss = coef_[1];        // b1 - b0 * a1
        const F b2ss = coef_[2];        // b2 - b0 * a2
        const F negativeA1 = coef_[3];  // -a1
        const F negativeA2 = coef_[4];  // -a2
        T s[2] = { s_[0], s_[1] };
        T x, new_s0; // OK to declare temps here rather than at the point of initialization.
#ifdef USE_DITHER
        constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
        T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
#endif
        constexpr bool b0_present = (OCCUPANCY & 0x1) != 0;
        constexpr bool a1_present = (OCCUPANCY & 0x8) != 0;
        constexpr bool a2_present = (OCCUPANCY & 0x10) != 0;
        constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 ||
                (b0_present && a1_present);
        constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 ||
                (b0_present && a2_present);

        // Unroll control. Make sure the constexpr remains constexpr :-).
        constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
        constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1;   // below this won't be unrolled.
        constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
        constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
                CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;

        if constexpr (SEPARATE_CHANNEL_OPTIMIZATION && CHANNELS == 1 && OCCUPANCY >= 11) {
            // Special acceleration which computes 2 samples at a time.
            // See reference [4] for computation of this matrix.
            intrinsics::internal_array_t<T, 4> A[4] = {
                {b0, b1ss, negativeA1 * b1ss + b2ss, negativeA2 * b1ss},
                {0, b0, b1ss, b2ss},
                {1, negativeA1, negativeA2 + negativeA1 * negativeA1, negativeA1 * negativeA2},
                {0, 1, negativeA1, negativeA2},
            };
            intrinsics::internal_array_t<T, 4> y;
            while (frames > 1) {
                x = vld1<T>(input);
                input += stride;
#ifdef USE_DITHER
                x = vadd(x, dither);
                dither = vneg(dither);
#endif
                y = vmul(A[0], x);

                x = vld1<T>(input);
                input += stride;
#ifdef USE_DITHER
                x = vadd(x, dither);
                dither = vneg(dither);
#endif
                y = vmla(y, A[1], x);
                y = vmla(y, A[2], s[0]);
                y = vmla(y, A[3], s[1]);

                vst1(output, y.v[0]);
                output += stride;

                vst1(output, y.v[1]);
                output += stride;

                s[0] = y.v[2];
                s[1] = y.v[3];
                frames -= 2;
            }
            if (frames == 0) {
                s_[0] = s[0];
                s_[1] = s[1];
                return;
            }
        }

        size_t remainder = 0;
        if constexpr (UNROLL_LOOPS > 1) {
            remainder = frames % UNROLL_LOOPS;
            frames /= UNROLL_LOOPS;
        }

        // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
        // The other alternative is to use a MACRO, but that doesn't read as well.
        const auto KERNEL = [&]() __attribute__((always_inline)) {
            x = vld1<T>(input);
            input += stride;
#ifdef USE_DITHER
            x = vadd(x, dither);
            dither = vneg(dither);
#endif
            // vst1(output, vadd(s[0], vmul(b0, x)));
            // output += stride;
            // new_s0 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s[0])), s[1]);
            // s[1] = vadd(vmul(b2ss, x), vmul(negativeA2, s[0]));

            if constexpr (b0_present) {
                vst1(output, vadd(s[0], vmul(b0, x)));
            } else /* constexpr */ {
                vst1(output, s[0]);
            }
            output += stride;
            new_s0 = s[1];
            if constexpr (b1ss_present) {
                new_s0 = vadd(new_s0, vmul(b1ss, x));
            }
            if constexpr (a1_present) {
                new_s0 = vadd(new_s0, vmul(negativeA1, s[0]));
            }
            if constexpr (b2ss_present) {
                s[1] = vmul(b2ss, x);
                if constexpr (a2_present) {
                    s[1] = vadd(s[1], vmul(negativeA2, s[0]));
                }
            } else if constexpr (a2_present) {
                s[1] = vmul(negativeA2, s[0]);
            }
            s[0] = new_s0;
        };

        while (frames > 0) {
            #pragma unroll
            for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
                KERNEL();
            }
            frames--;
        }
        if constexpr (UNROLL_LOOPS > 1) {
            for (size_t i = 0; i < remainder; ++i) {
                KERNEL();
            }
        }
        s_[0] = s[0];
        s_[1] = s[1];
    }
};

namespace details {

// Helper methods for constructing a constexpr array of function pointers.
// As function pointers are efficient and have no constructor/destructor
// this is preferred over std::function.
//
// SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
    using first_t = decltype(&F<0, false>::func);          // type from function
    using result_t = std::array<first_t, sizeof...(Is)>;   // type of array
    return result_t{{F<Is, SC>::func...}};                 // initialize with functions.
}

template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
static inline constexpr auto make_functional_array() {
    return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
}

// Returns true if the poles are stable for a Biquad.
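// This is the standard second order stability triangle applied to the denominator
// 1 + a1 z^-1 + a2 z^-2: |a2| < 1 and |a1| < 1 + a2.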
template <typename D>
static inline constexpr bool isStable(const D& a1, const D& a2) {
    return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
}

// Simplifies Biquad coefficients.
// TODO: consider matched pole/zero cancellation.
//       consider subnormal elimination for Intel processors.
template <typename D, typename T>
std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
    std::array<D, kBiquadNumCoefs> lcoef;
    if (coef.size() == kBiquadNumCoefs + 1) {
        // General form of Biquad.
        // Remove matched z^-1 factors in top and bottom (e.g. coef[0] == coef[3] == 0).
        size_t offset = 0;
        for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
        assert(coef[offset + 3] != 0); // hmm... shouldn't we be causal?

        // Normalize 6 coefficients to 5 for storage.
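        // For example (values illustrative only), {2, 4, 2, 2, 0, 1} with offset 0
        // normalizes to b0 = 1, b1 = 2, b2 = 1, a1 = 0, a2 = 0.5.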
        lcoef[0] = coef[offset] / coef[offset + 3];
        for (size_t i = 1; i + offset < 3; ++i) {
            lcoef[i] = coef[i + offset] / coef[offset + 3];
            lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
        }
    } else if (coef.size() == kBiquadNumCoefs) {
        std::copy(coef.begin(), coef.end(), lcoef.begin());
    } else {
        assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
    }
    return lcoef;
}

// Sets a container of coefficients to storage.
template <typename D, typename T, typename DEST>
static inline void setCoefficients(
        DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
    auto lcoef = reduceCoefficients<D, T>(coef);
    // replicate as needed
    for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
        for (size_t j = 0; j < channelCount; ++j) {
            dest[i * stride + offset + j] = lcoef[i];
        }
    }
}


// Helper function to zero channels in the output buffer.
// This is used for the degenerate coefficient case which results in all zeroes.
template <typename D>
void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
    if (stride == channelCount) {
        memset(out, 0, sizeof(D) * frames * channelCount);
    } else {
        for (size_t i = 0; i < frames; i++) {
            memset(out, 0, sizeof(D) * channelCount);
            out += stride;
        }
    }
}

template <typename ConstOptions,
        size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
        size_t channelCount, F *delays, const F *coefs, size_t localStride) {
    using namespace android::audio_utils::intrinsics;

    constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
    const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
    using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
    using KernelType = typename ConstOptions::template FilterType<T, CoefType>;

    for (size_t i = 0; i < channelCount; i += elements) {
        T s1 = vld1<T>(&delays[0]);
        T s2 = vld1<T>(&delays[localStride]);

        KernelType kernel(
                vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
                vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
                vld1<CoefType>(coefs + coefStride * 4),
                s1, s2);
        if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
        kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
        vst1(&delays[0], kernel.s_[0]);
        vst1(&delays[localStride], kernel.s_[1]);
        delays += elements;
    }
}

// Find the nearest occupancy mask that includes all the desired bits.
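// For example, a desired occupancy of 0x5 (b0 and b2 present) maps to 0x7 (double zero)
// for the occupancy lists declared by the filter kernels above.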
template <typename T, size_t N>
static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
    if (occupancy < 32) {
        for (auto test : occupancies) {
            if ((occupancy & test) == occupancy) return test;
        }
    }
    return 31;
}

enum FILTER_OPTION {
    FILTER_OPTION_SCALAR_ONLY = (1 << 0),
};


/**
 * DefaultBiquadConstOptions holds the default set of options for customizing
 * the Biquad filter at compile time.
 *
 * Consider inheriting from this structure and overriding the options
 * desired; this is backward compatible to new options added in the future.
 */
struct DefaultBiquadConstOptions {

    // Sets the Biquad filter type.
    // This can be one of the filter kernels defined above (BiquadDirect2Transpose or
    // BiquadStateSpace), or a user-defined filter kernel.
    template <typename T, typename F>
    using FilterType = BiquadStateSpace<T, F, false /* SEPARATE_CHANNEL_OPTIMIZATION */>;
};

#define BIQUAD_FILTER_CASE(N, ... /* type */) \
    case N: { \
        using VectorType = __VA_ARGS__; \
        biquad_filter_func_impl<ConstOptions, \
                nearestOccupancy(OCCUPANCY, \
                        ConstOptions::template FilterType<VectorType, D> \
                                ::required_occupancies_), \
                SAME_COEF_PER_CHANNEL, VectorType>( \
                out + offset, in + offset, frames, stride, remaining, \
                delays + offset, c, localStride); \
        goto exit; \
    }

template <typename ConstOptions, size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
        size_t channelCount, D *delays, const D *coefs, size_t localStride,
        FILTER_OPTION filterOptions) {
    if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
        zeroChannels(out, frames, stride, channelCount);
        return;
    }

    // Possible alternative intrinsic types for 2, 9, 15 float elements.
    // using alt_2_t = struct { struct { float a; float b; } s; };
    // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
    // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };

#ifdef USE_NEON
    // Use NEON types to ensure we have the proper intrinsic acceleration.
    using alt_16_t = float32x4x4_t;
    using alt_8_t = float32x4x2_t;
    using alt_4_t = float32x4_t;
#else
    // Use C++ types, no NEON needed.
    using alt_16_t = intrinsics::internal_array_t<float, 16>;
    using alt_8_t = intrinsics::internal_array_t<float, 8>;
    using alt_4_t = intrinsics::internal_array_t<float, 4>;
#endif

    for (size_t offset = 0; offset < channelCount; ) {
        size_t remaining = channelCount - offset;
        auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
        if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
        if constexpr (std::is_same_v<D, float>) {
            switch (remaining) {
            default:
                if (remaining >= 16) {
                    remaining &= ~15;
                    biquad_filter_func_impl<ConstOptions,
                            nearestOccupancy(OCCUPANCY,
                                    ConstOptions::template FilterType<D, D>
                                            ::required_occupancies_),
                            SAME_COEF_PER_CHANNEL, alt_16_t>(
                            out + offset, in + offset, frames, stride, remaining,
                            delays + offset, c, localStride);
                    offset += remaining;
                    continue;
                }
                break; // case 1 handled at bottom.
            BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
            BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
            BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
            BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
            BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
            BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
            BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
            BIQUAD_FILTER_CASE(8, alt_8_t)
            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
            BIQUAD_FILTER_CASE(4, alt_4_t)
            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
            // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
            }
        } else if constexpr (std::is_same_v<D, double>) {
#if defined(__aarch64__)
            switch (remaining) {
            default:
                if (remaining >= 8) {
                    remaining &= ~7;
                    biquad_filter_func_impl<ConstOptions,
                            nearestOccupancy(OCCUPANCY,
                                    ConstOptions::template FilterType<D, D>
                                            ::required_occupancies_),
                            SAME_COEF_PER_CHANNEL,
                            intrinsics::internal_array_t<double, 8>>(
                            out + offset, in + offset, frames, stride, remaining,
                            delays + offset, c, localStride);
                    offset += remaining;
                    continue;
                }
                break; // case 1 handled at bottom.
            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
            BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
            }
#endif
        }
    scalar:
        // Essentially the code below is scalar, the same as
        // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
        // but formulated with NEON intrinsic-like call pattern.
        biquad_filter_func_impl<ConstOptions,
                nearestOccupancy(OCCUPANCY,
                        ConstOptions::template FilterType<D, D>::required_occupancies_),
                SAME_COEF_PER_CHANNEL, D>(
                out + offset, in + offset, frames, stride, remaining,
                delays + offset, c, localStride);
        offset += remaining;
    }
    exit:;
}

} // namespace details

/**
 * BiquadFilter
 *
 * A multichannel Biquad filter implementation of the following transfer function.
 *
 * \f[
 *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
 *               { 1 + a_1 z^{-1} + a_2 z^{-2} }
 * \f]
 *
 * <!--
 *         b_0 + b_1 z^{-1} + b_2 z^{-2}
 *  H(z) = -----------------------------
 *          1 + a_1 z^{-1} + a_2 z^{-2}
 * -->
 *
 * Details:
 *   1. The transposed direct form 2 implementation allows zeros to be computed
 *      before poles in the internal state for improved filter precision and
 *      better time-varying coefficient performance.
 *   2. We optimize for zero coefficients using a compile-time generated function table.
 *   3. We optimize for vector operations using column vector operations with stride
 *      into interleaved audio data.
 *   4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
 *      negated form being slightly simpler for optimization (addition is commutative but
 *      subtraction is not commutative). This is to permit obtaining the coefficients
 *      as a const reference.
 *
 *      Compatibility issue: Some Biquad libraries store the denominator coefficients
 *      in negated form. We explicitly negate before entering the inner loop.
 *   5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
 *      coefficients. See setCoefficients() below.
 *
 * If SAME_COEF_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
 *
 * The Biquad filter update equation in transposed direct form 2 is as follows:
 *
 * \f{eqnarray*}{
 *  y[n] &=& b0 * x[n] + s1[n - 1] \\
 *  s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
 *  s2[n] &=& b2 * x[n] - a2 * y[n]
 * \f}
 *
 * For the transposed direct form 2 update equation, s1 and s2 represent the delay state
 * contained in the internal vector mDelays[]. This is stored interleaved by channel.
 *
 * Use `-ffast-math` to permit associative math optimizations, as we do not rely on
 * strict C operator precedence and associativity here.
 * TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
 *
 * \param D type variable representing the data type, one of float or double.
 *        The default is float.
 * \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
 *        are shared between channels, or false if the Biquad coefficients
 *        may differ between channels. The default is true.
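 *
 * A minimal usage sketch (illustrative only; the channel count and coefficient values
 * below are example values, not mandated by this header):
 *
 * \code
 *   constexpr size_t channelCount = 2;
 *   // Example normalized coefficients {b0, b1, b2, a1, a2} for a lowpass Biquad.
 *   android::audio_utils::BiquadFilter<float> filter(channelCount,
 *           std::array<float, android::audio_utils::kBiquadNumCoefs>{
 *                   0.2929f, 0.5858f, 0.2929f, 0.f, 0.1716f});
 *   // out and in are interleaved float buffers holding channelCount samples per frame.
 *   filter.process(out, in, frames);
 * \endcode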
 */
template <typename D = float,
        bool SAME_COEF_PER_CHANNEL = true,
        typename ConstOptions = details::DefaultBiquadConstOptions>
class BiquadFilter {
public:
    template <typename T = std::array<D, kBiquadNumCoefs>>
    explicit BiquadFilter(size_t channelCount,
            const T& coefs = {}, bool optimized = true)
        : mChannelCount(channelCount)
        , mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
        , mDelays(channelCount * kBiquadNumDelays) {
        setCoefficients(coefs, optimized);
    }

    // copy and move constructors
    BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
        *this = other;
    }

    BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
        *this = std::move(other);
    }

    // copy and move assignment
    BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
            const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
        mChannelCount = other.mChannelCount;
        mCoefs = other.mCoefs;
        mDelays = other.mDelays;
        mFilterOptions = other.mFilterOptions;
        mFunc = other.mFunc; // preserve the selected filter function.
        return *this;
    }

    BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
            BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
        mChannelCount = other.mChannelCount;
        mCoefs = std::move(other.mCoefs);
        mDelays = std::move(other.mDelays);
        mFilterOptions = other.mFilterOptions;
        mFunc = other.mFunc; // preserve the selected filter function.
        return *this;
    }

    // operator overloads for equality tests
    bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
        return mChannelCount == other.mChannelCount
                && mCoefs == other.mCoefs
                && mDelays == other.mDelays;
    }

    bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
        return !operator==(other);
    }


    /**
     * \brief Sets filter coefficients
     *
     * \param coefs the filter coefficient container (e.g. std::array or std::vector).
     * \param optimized whether to use processor optimized function (optional, defaults true).
     * \return true if the BiquadFilter is stable, otherwise false.
     *
     * The input coefficients are interpreted in the following manner:
     *
     * If size of container is 5 (normalized Biquad):
     * coefs[0] is b0,
     * coefs[1] is b1,
     * coefs[2] is b2,
     * coefs[3] is a1,
     * coefs[4] is a2.
     *
     * \f[
     *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
     *               { 1 + a_1 z^{-1} + a_2 z^{-2} }
     * \f]
     * <!--
     *         b_0 + b_1 z^{-1} + b_2 z^{-2}
     *  H(z) = -----------------------------
     *          1 + a_1 z^{-1} + a_2 z^{-2}
     * -->
     *
     * If size of container is 6 (general Biquad):
     * coefs[0] is b0,
     * coefs[1] is b1,
     * coefs[2] is b2,
     * coefs[3] is a0,
     * coefs[4] is a1,
     * coefs[5] is a2.
     *
     * \f[
     *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
     *               { a_0 + a_1 z^{-1} + a_2 z^{-2} }
     * \f]
     * <!--
     *         b_0 + b_1 z^{-1} + b_2 z^{-2}
     *  H(z) = -----------------------------
     *         a_0 + a_1 z^{-1} + a_2 z^{-2}
     * -->
     *
     * The internal representation is a normalized Biquad.
     */
    template <typename T = std::array<D, kBiquadNumCoefs>>
    bool setCoefficients(const T& coefs, bool optimized = true) {
        if constexpr (SAME_COEF_PER_CHANNEL) {
            details::setCoefficients<D, T>(
                    mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
        } else {
            if (coefs.size() == mCoefs.size()) {
                std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
            } else {
                details::setCoefficients<D, T>(
                        mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
            }
        }
        setOptimization(optimized);
        return isStable();
    }


    /**
     * Sets coefficients for one of the filter channels, specified by channelIndex.
     *
     * This method is only available if SAME_COEF_PER_CHANNEL is false.
     *
     * \param coefs the coefficients to set.
     * \param channelIndex the particular channel index to set.
     * \param optimized whether to use optimized function (optional, defaults true).
     */
    template <typename T = std::array<D, kBiquadNumCoefs>>
    bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
        static_assert(!SAME_COEF_PER_CHANNEL);

        details::setCoefficients<D, T>(
                mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
        setOptimization(optimized);
        return isStable();
    }

    /**
     * Returns the coefficients as a const vector reference.
     *
     * If multichannel and the template variable SAME_COEF_PER_CHANNEL is false,
     * the coefficients are interleaved by channel.
     */
    const std::vector<D>& getCoefficients() const {
        return mCoefs;
    }


    /**
     * Returns true if the filter is stable.
     *
     * \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
     *        asserts if channelIndex >= channel count (zero based index).
     */
    bool isStable(size_t channelIndex = 0) const {
        if constexpr (SAME_COEF_PER_CHANNEL) {
            return details::isStable(mCoefs[3], mCoefs[4]);
        } else {
            assert(channelIndex < mChannelCount);
            return details::isStable(
                    mCoefs[3 * mChannelCount + channelIndex],
                    mCoefs[4 * mChannelCount + channelIndex]);
        }
    }

    /**
     * Updates the filter function based on processor optimization.
     *
     * \param optimized if true, enables processor based optimization.
     */
    void setOptimization(bool optimized) {
        // Determine which coefficients are nonzero as a bit field.
        size_t category = 0;
        for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
            if constexpr (SAME_COEF_PER_CHANNEL) {
                category |= (mCoefs[i] != 0) << i;
            } else {
                for (size_t j = 0; j < mChannelCount; ++j) {
                    if (mCoefs[i * mChannelCount + j] != 0) {
                        category |= 1 << i;
                        break;
                    }
                }
            }
        }

        // Select the proper filtering function from our array.
        if (optimized) {
            mFilterOptions = (details::FILTER_OPTION)
                    (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
        } else {
            mFilterOptions = (details::FILTER_OPTION)
                    (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
        }
        mFunc = mFilterFuncs[category];
    }

    /**
     * \brief Filters the input data
     *
     * \param out pointer to the output data
     * \param in pointer to the input data
     * \param frames number of audio frames to be processed
     */
    void process(D* out, const D* in, size_t frames) {
        process(out, in, frames, mChannelCount);
    }


    /**
     * \brief Filters the input data with stride
     *
     * \param out pointer to the output data
     * \param in pointer to the input data
     * \param frames number of audio frames to be processed
     * \param stride the total number of samples associated with a frame, if not channelCount.
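     *        For example (illustrative), to filter only the first mChannelCount channels of a
     *        more widely interleaved buffer, pass that buffer's full frame width as the stride.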
     */
    void process(D* out, const D* in, size_t frames, size_t stride) {
        assert(stride >= mChannelCount);
        mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
                mCoefs.data(), mChannelCount, mFilterOptions);
    }

    /**
     * EXPERIMENTAL:
     * Processes 1D input data, with mChannelCount Biquads, using sliding window parallelism.
     *
     * Instead of considering mChannelCount Biquads as one-per-input-channel, this method treats
     * the mChannelCount Biquads as applied in sequence to a single 1D input stream,
     * with the highest-index Biquad applied first.
     *
     * input audio data -> BQ_{n-1} -> BQ_{n-2} -> BQ_{n-3} -> ... -> BQ_{0} -> output
     *
     * TODO: Make this code efficient for NEON and split the destination from the source.
     *
     * Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
     * sequentially applied, but in practice it is *MUCH* slower.
     * On NEON, the data cannot be written then read in-place without incurring
     * memory stall penalties. A shifting NEON holding register is required to make this
     * a practical improvement.
     */
    void process1D(D* inout, size_t frames) {
        size_t remaining = mChannelCount;
#ifdef USE_NEON
        // We apply NEON acceleration striped with 4 filters (channels) at once.
        // Filter operations commute; nevertheless we apply the filters in order.
        if (frames >= 2 * mChannelCount) {
            constexpr size_t channelBlock = 4;
            for (; remaining >= channelBlock; remaining -= channelBlock) {
                const size_t baseIdx = remaining - channelBlock;
                // This is the 1D accelerated method.
                // prime the data pipe.
                for (size_t i = 0; i < channelBlock - 1; ++i) {
                    size_t fromEnd = remaining - i - 1;
                    auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
                    auto delays = mDelays.data() + fromEnd;
                    mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
                            delays, coefs, mChannelCount, mFilterOptions);
                }

                auto delays = mDelays.data() + baseIdx;
                auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
                // Parallel processing - we use a sliding window doing channelBlock at once,
                // sliding one audio sample at a time.
                mFunc(inout, inout,
                        frames - channelBlock + 1, 1 /* stride */, channelBlock,
                        delays, coefs, mChannelCount, mFilterOptions);

                // drain the data pipe.
                for (size_t i = 1; i < channelBlock; ++i) {
                    mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
                            1 /* frames */, 1 /* stride */, channelBlock - i,
                            delays, coefs, mChannelCount, mFilterOptions);
                }
            }
        }
#endif
        // For short data sequences, we use the serial single channel logical equivalent.
        for (; remaining > 0; --remaining) {
            size_t fromEnd = remaining - 1;
            auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
            mFunc(inout, inout,
                    frames, 1 /* stride */, 1 /* channelCount */,
                    mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
        }
    }


    /**
     * \brief Clears the delay elements
     *
     * This function clears the delay elements representing the filter state.
     */
    void clear() {
        std::fill(mDelays.begin(), mDelays.end(), 0.f);
    }

    /**
     * \brief Sets the internal delays from a vector
     *
     * For a multichannel stream, the delays are interleaved by channel:
     * delays[2 * i + 0] is s1 of i-th channel,
     * delays[2 * i + 1] is s2 of i-th channel,
     * where index i runs from 0 to (mChannelCount - 1).
     *
     * \param delays reference to vector containing delays; the contents are moved from,
     *        leaving the argument vector in a valid but unspecified state.
     */
    void setDelays(std::vector<D>& delays) {
        assert(delays.size() == mDelays.size());
        mDelays = std::move(delays);
    }


    /**
     * \brief Gets delay elements as a vector
     *
     * For a multichannel stream, the delays are interleaved by channel:
     * delays[2 * i + 0] is s1 of i-th channel,
     * delays[2 * i + 1] is s2 of i-th channel,
     * where index i runs from 0 to (mChannelCount - 1).
     *
     * \return a const vector reference of delays.
     */
    const std::vector<D>& getDelays() const {
        return mDelays;
    }

private:
    /* const */ size_t mChannelCount; // not const because we assign to it in operator=.

    /**
     * \var D mCoefs
     * \brief Stores the filter coefficients
     *
     * If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
     * interleaved by channel.
     */
    std::vector<D> mCoefs;

    /**
     * \var D mDelays
     * \brief The delay state.
     *
     * The delays are stored channel interleaved in the following manner,
     * mDelays[2 * i + 0] is s1 of i-th channel
     * mDelays[2 * i + 1] is s2 of i-th channel
     * index i runs from 0 to (mChannelCount - 1).
     */
    std::vector<D> mDelays;

    details::FILTER_OPTION mFilterOptions{};

    // Consider making a separate delegation class.
    /*
     * We store an array of functions based on the occupancy.
     *
     * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
     * b0 b1 b2 a1 a2 (from lsb to msb).
     *
     * static inline constexpr std::array<filter_func*, M> mArray = {
     *     biquad_filter_func<0>,
     *     biquad_filter_func<1>,
     *     biquad_filter_func<2>,
     *     ...
     *     biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
     * };
     *
     * Every time the coefficients are changed, we select the processing function from
     * this table.
     */

    // Used to build the functional array.
    template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
    struct FuncWrap {
        static void func(D* out, const D *in, size_t frames, size_t stride,
                size_t channelCount, D *delays, const D *coef, size_t localStride,
                details::FILTER_OPTION filterOptions) {
            constexpr size_t NEAREST_OCCUPANCY =
                    details::nearestOccupancy(
                            OCCUPANCY, ConstOptions::template FilterType<D, D>
                                    ::required_occupancies_);
            details::biquad_filter_func<ConstOptions, NEAREST_OCCUPANCY, SC>(
                    out, in, frames, stride, channelCount, delays, coef, localStride,
                    filterOptions);
        }
    };

    // Vector optimized array of functions.
    static inline constexpr auto mFilterFuncs =
            details::make_functional_array<
                    FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();

    /**
     * \var filter_func* mFunc
     *
     * The current filter function selected for the channel occupancy of the Biquad.
     * It will be one of mFilterFuncs.
     */
    std::decay_t<decltype(mFilterFuncs[0])> mFunc;
};

} // namespace android::audio_utils

#pragma pop_macro("USE_DITHER")
#pragma pop_macro("USE_NEON")