/*
 * Copyright 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
#define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H

#include <array> // std::size
#include <type_traits>

/*
The intrinsics utility library contains helper functions for wide-width DSP support.
We use templated types to allow testing from scalar to vector values.

See the Eigen project for general abstracted linear algebra acceleration.
http://eigen.tuxfamily.org/
*/

// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON

#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif

namespace android::audio_utils::intrinsics {

// For a static_assert(false) in a discarded if-constexpr branch we need a
// template-dependent false, otherwise the assertion fires before instantiation.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename T>
inline constexpr bool dependent_false_v = false;

// Type of array embedded in a struct that is usable in the Neon template functions below.
// This type must satisfy std::is_array_v<>.
template<typename T, size_t N>
struct internal_array_t {
    T v[N];
    static constexpr size_t size() { return N; }
};
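
/*
  Illustrative usage (an assumption about typical client code, not part of this header):

    using vfloat8 = internal_array_t<float, 8>;   // 8 floats processed by the templates below
    vfloat8 acc = vdupn<vfloat8>(0.f);            // splat a scalar, see vdupn() further down
*/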

// Detect if the value is directly addressable as an array.
// This is more advanced than std::is_array and works with neon intrinsics.
template<typename T>
concept is_array_like = requires(T a) {
    a[0];  // can index first element
};

// Vector convert from type T to type S.
template <typename S, typename T>
inline S vconvert(const T& in) {
    S out;

    if constexpr (is_array_like<S>) {
        if constexpr (is_array_like<T>) {
            // neon intrinsics need sizeof.
#pragma unroll
            for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); ++i) {
                out[i] = in[i];
            }
        } else { /* constexpr */
            const auto& [inv] = in;
#pragma unroll
            for (size_t i = 0; i < T::size(); ++i) {
                out[i] = inv[i];
            }
        }
    } else { /* constexpr */
        auto& [outv] = out;
        if constexpr (is_array_like<T>) {
            // neon intrinsics need sizeof.
#pragma unroll
            for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); ++i) {
                outv[i] = in[i];
            }
        } else { /* constexpr */
            const auto& [inv] = in;
#pragma unroll
            for (size_t i = 0; i < T::size(); ++i) {
                outv[i] = inv[i];
            }
        }
    }
    return out;
}
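
/*
  For example (illustrative only, not part of the API), converting a float struct to its
  double counterpart:

    internal_array_t<float, 4> in{{1.f, 2.f, 3.f, 4.f}};
    auto out = vconvert<internal_array_t<double, 4>>(in);  // out.v[i] == in.v[i]
*/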

/*
  Generalized template functions for the Neon instruction set.

  See here for some general comments from ARM.
  https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization

  Notes:
  1) We provide scalar equivalents which are compilable even on non-ARM processors.
  2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
  3) NEON double SIMD acceleration is only available on 64 bit architectures.
     On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.

  We create a generic Neon acceleration to be applied to a composite type.

  The type follows these compositional rules for simplicity:
      1) must be a primitive floating point type.
      2) must be a NEON data type.
      3) must be a struct with one member, either
          a) an array of types 1-3.
          b) a cons-pair struct of 2 possibly different members of types 1-3.

  Examples of possible struct definitions:
  using alternative_2_t = struct { struct { float a; float b; } s; };
  using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
  using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
*/
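
/*
  As an illustration of the recursion (assumed usage, not a complete list), the generic
  functions below decompose such a composite type member by member. Using the
  alternative_15_t example above (8 + 7 = 15 lanes):

    alternative_15_t x = vdupn<alternative_15_t>(1.f);   // all 15 lanes set to 1.f
    alternative_15_t y = vadd(x, x);                      // all 15 lanes become 2.f
*/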

// add a + b
template<typename T>
static inline T vadd(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vadd_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vaddq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vaddq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vadd(aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            const auto &[a1, a2] = aval;
            const auto &[b1, b2] = bval;
            r1 = vadd(a1, b1);
            r2 = vadd(a2, b2);
            return ret;
        }
    }
}

// horizontal add: sum all elements of a into a single value.
template<typename T>
inline auto vaddv(const T& a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vaddv_f32(a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vaddvq_f32(a);
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vaddvq_f64(a);
#endif
#endif // USE_NEON
    } else if constexpr (is_array_like<T>) {
        using ret_t = std::decay_t<decltype(a[0])>;

        ret_t ret{};
        // array_like is not the same as an array, so we use sizeof here
        // to handle neon intrinsics.
#pragma unroll
        for (size_t i = 0; i < sizeof(a) / sizeof(a[0]); ++i) {
            ret += a[i];
        }
        return ret;
    } else /* constexpr */ {
        const auto &[aval] = a;
        using ret_t = std::decay_t<decltype(aval[0])>;
        ret_t ret{};

#pragma unroll
        for (size_t i = 0; i < std::size(aval); ++i) {
            ret += aval[i];
        }
        return ret;
    }
}

// duplicate float into all elements.
template<typename T, typename F>
static inline T vdupn(F f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (auto& val : retval) {
                val = vdupn<std::decay_t<decltype(val)>>(f);
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            using r1_type = std::decay_t<decltype(r1)>;
            using r2_type = std::decay_t<decltype(r2)>;
            r1 = vdupn<r1_type>(f);
            r2 = vdupn<r2_type>(f);
            return ret;
        }
    }
}
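
/*
  For instance (illustrative), the same call compiles for scalar and for composite types,
  which is what allows testing the templates below without NEON hardware:

    float s = vdupn<float>(3.f);                        // scalar pass-through
    auto  v = vdupn<internal_array_t<float, 4>>(3.f);   // v.v[0..3] == 3.f
*/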

// load from float pointer.
template<typename T, typename F>
static inline T vld1(const F *f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
            using element_type = std::decay_t<decltype(retval[0])>;
            constexpr size_t subelements = sizeof(element_type) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(retval); ++i) {
                retval[i] = vld1<element_type>(f);
                f += subelements;
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            using r1_type = std::decay_t<decltype(r1)>;
            using r2_type = std::decay_t<decltype(r2)>;
            r1 = vld1<r1_type>(f);
            f += sizeof(r1) / sizeof(F);
            r2 = vld1<r2_type>(f);
            return ret;
        }
    }
}

/**
 * Returns a + b * c (multiply-accumulate), where the product term is formed as:
 *   (b * c)_i = b_i * c_i if b and c are the same vector type, or
 *   (b * c)_i = b_i * c   if b is a vector and c is scalar, or
 *   (b * c)_i = b * c_i   if b is scalar and c is a vector.
 */
template<typename T, typename S, typename F>
static inline T vmla(T a, S b, F c) {
    // Both types T and S are non-primitive and they are not equal. T == S is handled below.
    (void) a;
    (void) b;
    (void) c;
    static_assert(dependent_false_v<T>);
}

template<typename T, typename F>
static inline T vmla(T a, T b, F c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
            return a + b * c;
        } else {
            static_assert(dependent_false_v<T>);
        }
    } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
        // handle the lane variant
#ifdef USE_NEON
        if constexpr (std::is_same_v<T, float32x2_t>) {
            return vmla_n_f32(a, b, c);
        } else if constexpr (std::is_same_v<T, float32x4_t>) {
            return vmlaq_n_f32(a, b, c);
#if defined(__aarch64__)
        } else if constexpr (std::is_same_v<T, float64x2_t>) {
            // no multiply-accumulate-by-scalar intrinsic for f64; splat the scalar instead.
            return vmlaq_f64(a, b, vdupq_n_f64(c));
#endif
        } else
#endif // USE_NEON
        {
            T ret;
            auto &[retval] = ret;  // single-member struct
            const auto &[aval] = a;
            const auto &[bval] = b;
            if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
                for (size_t i = 0; i < std::size(aval); ++i) {
                    retval[i] = vmla(aval[i], bval[i], c);
                }
                return ret;
            } else /* constexpr */ {
                auto &[r1, r2] = retval;
                const auto &[a1, a2] = aval;
                const auto &[b1, b2] = bval;
                r1 = vmla(a1, b1, c);
                r2 = vmla(a2, b2, c);
                return ret;
            }
        }
    } else {
        // Both types T and F are non-primitive and they are not equal.
        static_assert(dependent_false_v<T>);
    }
}

template<typename T, typename F>
static inline T vmla(T a, F b, T c) {
    return vmla(a, c, b);
}

// multiply-accumulate: a + b * c
template<typename T>
inline T vmla(const T& a, const T& b, const T& c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        const auto &[cval] = c;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmla(aval[i], bval[i], cval[i]);
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            const auto &[a1, a2] = aval;
            const auto &[b1, b2] = bval;
            const auto &[c1, c2] = cval;
            r1 = vmla(a1, b1, c1);
            r2 = vmla(a2, b2, c2);
            return ret;
        }
    }
}
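
/*
  Illustrative only (the buffer and variable names are assumptions, not part of the API):
  accumulating a gain-scaled block, as a mixer might.

    using vfloat4 = internal_array_t<float, 4>;
    vfloat4 acc = vdupn<vfloat4>(0.f);
    vfloat4 in  = vld1<vfloat4>(source);   // source: const float* with at least 4 floats
    acc = vmla(acc, in, 0.25f);            // acc.v[i] += in.v[i] * 0.25f
*/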

/**
 * Returns c as follows:
 * c_i = a_i * b_i if a and b are the same vector type or
 * c_i = a_i * b if a is a vector and b is scalar or
 * c_i = a * b_i if a is scalar and b is a vector.
 */
template<typename T, typename F>
static inline auto vmul(T a, F b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
            return a * b;
        } else /* constexpr */ {
            return vmul(b, a);  // we prefer T to be the vector/struct form.
        }
    } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
        // handle the lane variant
#ifdef USE_NEON
        if constexpr (std::is_same_v<T, float32x2_t>) {
            return vmul_n_f32(a, b);
        } else if constexpr (std::is_same_v<T, float32x4_t>) {
            return vmulq_n_f32(a, b);
#if defined(__aarch64__)
        } else if constexpr (std::is_same_v<T, float64x2_t>) {
            return vmulq_n_f64(a, b);
#endif
        } else
#endif // USE_NEON
        {
            T ret;
            auto &[retval] = ret;  // single-member struct
            const auto &[aval] = a;
            if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
                for (size_t i = 0; i < std::size(aval); ++i) {
                    retval[i] = vmul(aval[i], b);
                }
                return ret;
            } else /* constexpr */ {
                auto &[r1, r2] = retval;
                const auto &[a1, a2] = aval;
                r1 = vmul(a1, b);
                r2 = vmul(a2, b);
                return ret;
            }
        }
    } else {
        // Both types T and F are non-primitive and they are not equal.
        static_assert(dependent_false_v<T>);
    }
}

template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmul(aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            const auto &[a1, a2] = aval;
            const auto &[b1, b2] = bval;
            r1 = vmul(a1, b1);
            r2 = vmul(a2, b2);
            return ret;
        }
    }
}
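
/*
  Example (ours, for illustration): a 4-wide dot product built from the primitives above.

    using vfloat4 = internal_array_t<float, 4>;
    vfloat4 a = vld1<vfloat4>(pa);   // pa, pb: const float* with at least 4 floats each
    vfloat4 b = vld1<vfloat4>(pb);
    float dot = vaddv(vmul(a, b));   // sum of a.v[i] * b.v[i]
*/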

// negate
template<typename T>
static inline T vneg(T f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[fval] = f;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(fval); ++i) {
                retval[i] = vneg(fval[i]);
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            const auto &[f1, f2] = fval;
            r1 = vneg(f1);
            r2 = vneg(f2);
            return ret;
        }
    }
}

// store to float pointer.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto &[aval] = a;
        if constexpr (std::is_array_v<decltype(aval)>) {
            constexpr size_t subelements = sizeof(std::decay_t<decltype(aval[0])>) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                vst1(f, aval[i]);
                f += subelements;
            }
        } else /* constexpr */ {
            const auto &[a1, a2] = aval;
            vst1(f, a1);
            f += sizeof(std::decay_t<decltype(a1)>) / sizeof(F);
            vst1(f, a2);
        }
    }
}
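
/*
  Putting vld1 / vmul / vst1 together (an illustrative sketch; the buffer names and the
  divisibility assumption are ours, not part of this header):

    // Scale `count` floats from `in` to `out` by a scalar `gain`, 8 at a time.
    // Assumes count % 8 == 0.
    using vfloat8 = internal_array_t<float, 8>;
    for (size_t i = 0; i < count; i += 8) {
        vfloat8 x = vld1<vfloat8>(in + i);
        vst1(out + i, vmul(x, gain));
    }
*/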

// subtract a - b
template<typename T>
static inline T vsub(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a - b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vsub_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vsubq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vsubq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vsub(aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
            auto &[r1, r2] = retval;
            const auto &[a1, a2] = aval;
            const auto &[b1, b2] = bval;
            r1 = vsub(a1, b1);
            r2 = vsub(a2, b2);
            return ret;
        }
    }
}

} // namespace android::audio_utils::intrinsics

#pragma pop_macro("USE_NEON")

#endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H