1 /*
2  * Copyright 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
18 #define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
19 
20 #include <array>  // std::size
21 #include <type_traits>
22 
23 /*
  The intrinsics utility library contains helper functions for wide-width DSP support.
25   We use templated types to allow testing from scalar to vector values.
26 
27   See the Eigen project for general abstracted linear algebra acceleration.
28   http://eigen.tuxfamily.org/
29 */
30 
31 // We conditionally include neon optimizations for ARM devices
32 #pragma push_macro("USE_NEON")
33 #undef USE_NEON
34 
35 #if defined(__ARM_NEON__) || defined(__aarch64__)
36 #include <arm_neon.h>
37 #define USE_NEON
38 #endif
39 
40 namespace android::audio_utils::intrinsics {
41 
// A plain static_assert(false) fails as soon as the enclosing template is parsed.
// Routing the condition through a template parameter makes it dependent, so the
// assertion is only evaluated when the offending branch is actually instantiated.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename T>
inline constexpr bool dependent_false_v{false};
46 
// Aggregate that embeds a fixed-size C array, usable in the Neon template functions below.
// The embedded member `v` is a raw array, i.e. it satisfies std::is_array_v<>.
//   T: element type.
//   N: number of elements, also reported by size().
template<typename T, size_t N>
struct internal_array_t {
    T v[N];
    static constexpr auto size() -> size_t { return N; }
};
54 
55 // Detect if the value is directly addressable as an array.
56 // This is more advanced than std::is_array and works with neon intrinsics.
57 template<typename T>
requires(T a)58 concept is_array_like = requires(T a) {
59     a[0];  // can index first element
60 };
61 
62 // Vector convert between type T to type S.
63 template <typename S, typename T>
vconvert(const T & in)64 inline S vconvert(const T& in) {
65     S out;
66 
67     if constexpr (is_array_like<S>) {
68         if constexpr (is_array_like<T>) {
69 #pragma unroll
70             // neon intrinsics need sizeof.
71             for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); ++i) {
72                 out[i] = in[i];
73             }
74         } else { /* constexpr */
75             const auto& [inv] = in;
76 #pragma unroll
77             for (size_t i = 0; i < T::size(); ++i) {
78                 out[i] = inv[i];
79             }
80         }
81     } else { /* constexpr */
82         auto& [outv] = out;
83         if constexpr (is_array_like<T>) {
84 #pragma unroll
85             // neon intrinsics need sizeof.
86             for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); ++i) {
87                 outv[i] = in[i];
88             }
89         } else { /* constexpr */
90             const auto& [inv] = in;
91 #pragma unroll
92             for (size_t i = 0; i < T::size(); ++i) {
93                 outv[i] = inv[i];
94             }
95         }
96     }
97     return out;
98 }
99 
100 /*
101   Generalized template functions for the Neon instruction set.
102 
103   See here for some general comments from ARM.
104   https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization
105 
106   Notes:
107   1) We provide scalar equivalents which are compilable even on non-ARM processors.
108   2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
109   3) NEON double SIMD acceleration is only available on 64 bit architectures.
110      On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.
111 
112   We create a generic Neon acceleration to be applied to a composite type.
113 
  For simplicity, the type must be exactly one of the following:
      1) a primitive floating point type.
      2) a NEON data type.
      3) a struct with one member, which is either
           a) an array of types 1-3.
           b) a cons-pair struct of 2 possibly different members of types 1-3.
120 
121   Examples of possible struct definitions:
122   using alternative_2_t = struct { struct { float a; float b; } s; };
123   using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
124   using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
125 */
126 
// Element-wise addition: returns a + b.
//
// T is a float/double scalar, a NEON vector type (NEON builds only), or a
// single-member struct whose member is an array or a two-member pair of such types.
template<typename T>
static inline T vadd(T a, T b) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a + b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vadd_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vaddq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vaddq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T sum;
        auto &[sum_member] = sum;  // unwrap the single data member
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(sum_member)>) {
            // array member: recurse on every element.
#pragma unroll
            for (size_t idx = 0; idx < std::size(lhs); ++idx) {
                sum_member[idx] = vadd(lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // pair member: recurse on each half, which may differ in type.
            auto &[sum_first, sum_second] = sum_member;
            const auto &[lhs_first, lhs_second] = lhs;
            const auto &[rhs_first, rhs_second] = rhs;
            sum_first = vadd(lhs_first, rhs_first);
            sum_second = vadd(lhs_second, rhs_second);
        }
        return sum;
    }
}
165 
166 // add internally
167 template<typename T>
vaddv(const T & a)168 inline auto vaddv(const T& a) {
169     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
170         return a;
171 
172 #ifdef USE_NEON
173     } else if constexpr (std::is_same_v<T, float32x2_t>) {
174         return vaddv_f32(a);
175 #if defined(__aarch64__)
176     } else if constexpr (std::is_same_v<T, float32x4_t>) {
177         return vaddvq_f32(a);
178     } else if constexpr (std::is_same_v<T, float64x2_t>) {
179         return vaddvq_f64(a);
180 #endif
181 #endif // USE_NEON
182     } else if constexpr (is_array_like<T>) {
183         using ret_t = std::decay_t<decltype(a[0])>;
184 
185         ret_t ret{};
186         // array_like is not the same as an array, so we use sizeof here
187         // to handle neon instrinsics.
188 #pragma unroll
189         for (size_t i = 0; i < sizeof(a) / sizeof(a[0]); ++i) {
190             ret += a[i];
191         }
192         return ret;
193     } else /* constexpr */ {
194         const auto &[aval] = a;
195         using ret_t = std::decay_t<decltype(aval[0])>;
196         ret_t ret{};
197 
198 #pragma unroll
199         for (size_t i = 0; i < std::size(aval); ++i) {
200             ret += aval[i];
201         }
202         return ret;
203     }
204 }
205 
// Broadcast: returns a T in which every element is set to the scalar f.
//
// T must be given explicitly (e.g. vdupn<float32x4_t>(1.f)); F is deduced from f.
// Composite T (single-member struct of an array or a pair) is filled recursively.
template<typename T, typename F>
static inline T vdupn(F f) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T filled;
        auto &[payload] = filled;  // unwrap the single data member
        if constexpr (std::is_array_v<decltype(payload)>) {
            using element_t = std::decay_t<decltype(payload[0])>;
#pragma unroll
            for (size_t idx = 0; idx < std::size(payload); ++idx) {
                payload[idx] = vdupn<element_t>(f);
            }
        } else /* constexpr */ {
            // pair member: each half is broadcast with its own type.
            auto &[first, second] = payload;
            using first_t = std::decay_t<decltype(first)>;
            using second_t = std::decay_t<decltype(second)>;
            first = vdupn<first_t>(f);
            second = vdupn<second_t>(f);
        }
        return filled;
    }
}
242 
// Load: reads consecutive values of type F starting at f and returns them as a T.
//
// T must be given explicitly; F is deduced from the pointer.  For composite T the
// members are loaded in declaration order, each one starting where the previous
// one ended (its size in bytes divided by sizeof(F)).
template<typename T, typename F>
static inline T vld1(const F *f) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T loaded;
        auto &[payload] = loaded;  // unwrap the single data member
        if constexpr (std::is_array_v<decltype(payload)>) {
            using element_t = std::decay_t<decltype(payload[0])>;
            // number of F values consumed by one array element.
            constexpr size_t kStride = sizeof(element_t) / sizeof(F);
#pragma unroll
            for (size_t idx = 0; idx < std::size(payload); ++idx) {
                payload[idx] = vld1<element_t>(f + idx * kStride);
            }
        } else /* constexpr */ {
            auto &[first, second] = payload;
            using first_t = std::decay_t<decltype(first)>;
            using second_t = std::decay_t<decltype(second)>;
            first = vld1<first_t>(f);
            // the second half starts right after the F values used by the first.
            second = vld1<second_t>(f + sizeof(first) / sizeof(F));
        }
        return loaded;
    }
}
283 
284 /**
285  * Returns c as follows:
286  * c_i = a_i * b_i if a and b are the same vector type or
287  * c_i = a_i * b if a is a vector and b is scalar or
288  * c_i = a * b_i if a is scalar and b is a vector.
289  */
290 template<typename T, typename S, typename F>
vmla(T a,S b,F c)291 static inline T vmla(T a, S b, F c) {
292     // Both types T and S are non-primitive and they are not equal.  T == S handled below.
293     (void) a;
294     (void) b;
295     (void) c;
296     static_assert(dependent_false_v<T>);
297 }
298 
299 template<typename T, typename F>
vmla(T a,T b,F c)300 static inline T vmla(T a, T b, F c) {
301     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
302         if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
303             return a + b * c;
304         } else {
305             static_assert(dependent_false_v<T>);
306         }
307     } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
308         // handle the lane variant
309 #ifdef USE_NEON
310         if constexpr (std::is_same_v<T, float32x2_t>) {
311             return vmla_n_f32(a, b, c);
312         } else if constexpr (std::is_same_v<T, float32x4_t>) {
313             return vmlaq_n_f32(a, b,c);
314 #if defined(__aarch64__)
315         } else if constexpr (std::is_same_v<T, float64x2_t>) {
316             return vmlaq_n_f64(a, b);
317 #endif
318         } else
319 #endif // USE_NEON
320         {
321         T ret;
322         auto &[retval] = ret;  // single-member struct
323         const auto &[aval] = a;
324         const auto &[bval] = b;
325         if constexpr (std::is_array_v<decltype(retval)>) {
326 #pragma unroll
327             for (size_t i = 0; i < std::size(aval); ++i) {
328                 retval[i] = vmla(aval[i], bval[i], c);
329             }
330             return ret;
331         } else /* constexpr */ {
332              auto &[r1, r2] = retval;
333              const auto &[a1, a2] = aval;
334              const auto &[b1, b2] = bval;
335              r1 = vmla(a1, b1, c);
336              r2 = vmla(a2, b2, c);
337              return ret;
338         }
339         }
340     } else {
341         // Both types T and F are non-primitive and they are not equal.
342         static_assert(dependent_false_v<T>);
343     }
344 }
345 
// Multiply-accumulate where the middle operand has a different type: computes
// a + b * c with a and c of type T and b of type F.
// The two multiplicands are swapped and the call is forwarded as vmla(a, c, b),
// so that the T-typed operands come first.
template<typename T, typename F>
static inline T vmla(T a, F b, T c) {
    return vmla(a, c, b);
}
350 
// Multiply-accumulate with three operands of the same type: returns a + b * c,
// computed element by element.
//
// T is a float/double scalar, a NEON vector type (NEON builds only), or a
// single-member struct whose member is an array or a two-member pair of such types.
template<typename T>
inline T vmla(const T& a, const T& b, const T& c) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T result;
        auto &[out] = result;  // unwrap the single data member
        const auto &[acc] = a;
        const auto &[lhs] = b;
        const auto &[rhs] = c;
        if constexpr (std::is_array_v<decltype(out)>) {
#pragma unroll
            for (size_t idx = 0; idx < std::size(acc); ++idx) {
                out[idx] = vmla(acc[idx], lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // pair member: recurse on each half, which may differ in type.
            auto &[out_first, out_second] = out;
            const auto &[acc_first, acc_second] = acc;
            const auto &[lhs_first, lhs_second] = lhs;
            const auto &[rhs_first, rhs_second] = rhs;
            out_first = vmla(acc_first, lhs_first, rhs_first);
            out_second = vmla(acc_second, lhs_second, rhs_second);
        }
        return result;
    }
}
391 
392 /**
393  * Returns c as follows:
394  * c_i = a_i * b_i if a and b are the same vector type or
395  * c_i = a_i * b if a is a vector and b is scalar or
396  * c_i = a * b_i if a is scalar and b is a vector.
397  */
398 template<typename T, typename F>
vmul(T a,F b)399 static inline auto vmul(T a, F b) {
400     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
401         if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
402             return a * b;
403         } else /* constexpr */ {
404             return vmul(b, a); // we prefer T to be the vector/struct form.
405         }
406     } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
407         // handle the lane variant
408 #ifdef USE_NEON
409         if constexpr (std::is_same_v<T, float32x2_t>) {
410             return vmul_n_f32(a, b);
411         } else if constexpr (std::is_same_v<T, float32x4_t>) {
412             return vmulq_n_f32(a, b);
413 #if defined(__aarch64__)
414         } else if constexpr (std::is_same_v<T, float64x2_t>) {
415             return vmulq_n_f64(a, b);
416 #endif
417         } else
418 #endif // USE_NEON
419         {
420         T ret;
421         auto &[retval] = ret;  // single-member struct
422         const auto &[aval] = a;
423         if constexpr (std::is_array_v<decltype(retval)>) {
424 #pragma unroll
425             for (size_t i = 0; i < std::size(aval); ++i) {
426                 retval[i] = vmul(aval[i], b);
427             }
428             return ret;
429         } else /* constexpr */ {
430              auto &[r1, r2] = retval;
431              const auto &[a1, a2] = aval;
432              r1 = vmul(a1, b);
433              r2 = vmul(a2, b);
434              return ret;
435         }
436         }
437     } else {
438         // Both types T and F are non-primitive and they are not equal.
439         static_assert(dependent_false_v<T>);
440     }
441 }
442 
// Element-wise multiplication of two operands of the same type: returns a * b.
//
// T is a float/double scalar, a NEON vector type (NEON builds only), or a
// single-member struct whose member is an array or a two-member pair of such types.
template<typename T>
static inline T vmul(T a, T b) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T product;
        auto &[out] = product;  // unwrap the single data member
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
#pragma unroll
            for (size_t idx = 0; idx < std::size(lhs); ++idx) {
                out[idx] = vmul(lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // pair member: recurse on each half, which may differ in type.
            auto &[out_first, out_second] = out;
            const auto &[lhs_first, lhs_second] = lhs;
            const auto &[rhs_first, rhs_second] = rhs;
            out_first = vmul(lhs_first, rhs_first);
            out_second = vmul(lhs_second, rhs_second);
        }
        return product;
    }
}
480 
// Element-wise negation: returns -f.
//
// T is a float/double scalar, a NEON vector type (NEON builds only), or a
// single-member struct whose member is an array or a two-member pair of such types.
template<typename T>
static inline T vneg(T f) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T negated;
        auto &[out] = negated;  // unwrap the single data member
        const auto &[src] = f;
        if constexpr (std::is_array_v<decltype(out)>) {
#pragma unroll
            for (size_t idx = 0; idx < std::size(src); ++idx) {
                out[idx] = vneg(src[idx]);
            }
        } else /* constexpr */ {
            // pair member: negate each half independently.
            auto &[out_first, out_second] = out;
            const auto &[src_first, src_second] = src;
            out_first = vneg(src_first);
            out_second = vneg(src_second);
        }
        return negated;
    }
}
517 
// Store: writes the elements of a to consecutive locations starting at f.
//
// For composite T the members are written in declaration order, each one starting
// where the previous one ended (its size in bytes divided by sizeof(F)).
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto &[payload] = a;  // unwrap the single data member
        if constexpr (std::is_array_v<decltype(payload)>) {
            using element_t = std::decay_t<decltype(payload[0])>;
            // number of F values produced by one array element.
            constexpr size_t kStride = sizeof(element_t) / sizeof(F);
#pragma unroll
            for (size_t idx = 0; idx < std::size(payload); ++idx) {
                vst1(f + idx * kStride, payload[idx]);
            }
        } else /* constexpr */ {
            const auto &[first, second] = payload;
            using first_t = std::decay_t<decltype(first)>;
            vst1(f, first);
            // the second half is written right after the F values of the first.
            vst1(f + sizeof(first_t) / sizeof(F), second);
        }
    }
}
552 
// Element-wise subtraction: returns a - b.
//
// T is a float/double scalar, a NEON vector type (NEON builds only), or a
// single-member struct whose member is an array or a two-member pair of such types.
template<typename T>
static inline T vsub(T a, T b) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a - b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vsub_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vsubq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vsubq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T difference;
        auto &[out] = difference;  // unwrap the single data member
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
#pragma unroll
            for (size_t idx = 0; idx < std::size(lhs); ++idx) {
                out[idx] = vsub(lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // pair member: recurse on each half, which may differ in type.
            auto &[out_first, out_second] = out;
            const auto &[lhs_first, lhs_second] = lhs;
            const auto &[rhs_first, rhs_second] = rhs;
            out_first = vsub(lhs_first, rhs_first);
            out_second = vsub(lhs_second, rhs_second);
        }
        return difference;
    }
}
591 
592 } // namespace android::audio_utils::intrinsics
593 
594 #pragma pop_macro("USE_NEON")
595 
596 #endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
597