| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cmath> | ||
| 6 | |||
| 7 | #include "kleidicv/arithmetics/exp_constants.h" | ||
| 8 | #include "kleidicv/kleidicv.h" | ||
| 9 | #include "kleidicv/neon.h" | ||
| 10 | |||
| 11 | namespace kleidicv::neon { | ||
| 12 | |||
| 13 | template <typename ScalarType> | ||
| 14 | class Exp; | ||
| 15 | |||
| 16 | template <> | ||
| 17 | class Exp<float> final : public UnrollOnce { | ||
| 18 | public: | ||
| 19 | using VecTraits = neon::VecTraits<float>; | ||
| 20 | using VectorType = typename VecTraits::VectorType; | ||
| 21 | |||
| 22 | 2903 | VectorType vector_path(VectorType src) { | |
| 23 | 2903 | float32x4_t n, r, scale, poly, z; | |
| 24 | 2903 | uint32x4_t cmp, e; | |
| 25 | |||
| 26 | /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] | ||
| 27 | x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ | ||
| 28 | 2903 | z = vfmaq_f32(vdupq_n(exp_f32::kShift), src, vdupq_n(exp_f32::kInvLn2)); | |
| 29 | 2903 | n = z - vdupq_n(exp_f32::kShift); | |
| 30 | 2903 | r = vfmaq_f32(src, n, vdupq_n(-exp_f32::kLn2Hi)); | |
| 31 | 2903 | r = vfmaq_f32(r, n, vdupq_n(-exp_f32::kLn2Lo)); | |
| 32 | 2903 | e = vreinterpretq_u32_f32(z) << 23; | |
| 33 | 2903 | scale = vreinterpretq_f32_u32(e + vdupq_n(0x3f800000)); | |
| 34 | 2903 | cmp = vcagtq_f32(n, vdupq_n(126.0F)); | |
| 35 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[1]), vdupq_n(exp_f32::kPoly[0]), r); | |
| 36 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[2]), poly, r); | |
| 37 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[3]), poly, r); | |
| 38 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[4]), poly, r); | |
| 39 | 2903 | poly = vfmaq_f32(vdupq_n(1.0F), poly, r); | |
| 40 | 2903 | poly = vfmaq_f32(vdupq_n(1.0F), poly, r); | |
| 41 |
2/2✓ Branch 0 taken 2601 times.
✓ Branch 1 taken 302 times.
|
2903 | if (KLEIDICV_UNLIKELY(v_any_u32(cmp))) { |
| 42 | 2601 | return specialcase(poly, n, e); | |
| 43 | } | ||
| 44 | 302 | return scale * poly; | |
| 45 | 2903 | } | |
| 46 | |||
| 47 | 80 | float scalar_path(float src) { return expf(src); } | |
| 48 | |||
| 49 | private: | ||
| 50 | 2903 | static int v_any_u32(uint32x4_t x) { | |
| 51 | /* assume elements in x are either 0 or -1u. */ | ||
| 52 | 2903 | return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0; | |
| 53 | } | ||
| 54 | |||
| 55 | 2601 | static float32x4_t specialcase(float32x4_t poly, float32x4_t n, | |
| 56 | uint32x4_t e) { | ||
| 57 | /* 2^n may overflow, break it up into s1*s2. */ | ||
| 58 | 2601 | uint32x4_t b = (n <= vdupq_n(0.0F)) & vdupq_n(0x83000000); | |
| 59 | 2601 | float32x4_t s1 = vreinterpretq_f32_u32(vdupq_n(0x7f000000) + b); | |
| 60 | 2601 | float32x4_t s2 = vreinterpretq_f32_u32(e - b); | |
| 61 | 2601 | uint32x4_t cmp = vcagtq_f32(n, vdupq_n(192.0F)); | |
| 62 | 2601 | float32x4_t r1 = s1 * s1; | |
| 63 | 2601 | float32x4_t r0 = (poly * s1) * s2; | |
| 64 | 7803 | return vreinterpretq_f32_u32((cmp & vreinterpretq_u32_f32(r1)) | | |
| 65 | 2601 | (~cmp & vreinterpretq_u32_f32(r0))); | |
| 66 | 2601 | } | |
| 67 | }; // end of class Exp<float> | ||
| 68 | |||
| 69 | template <typename T> | ||
| 70 | 81 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp(const T* src, size_t src_stride, | |
| 71 | T* dst, size_t dst_stride, | ||
| 72 | size_t width, size_t height) { | ||
| 73 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 79 times.
|
81 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 74 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 77 times.
|
79 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 75 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 75 times.
|
77 | CHECK_IMAGE_SIZE(width, height); |
| 76 | |||
| 77 | 75 | Exp<T> operation; | |
| 78 | 75 | Rectangle rect{width, height}; | |
| 79 | 75 | Rows<const T> src_rows{src, src_stride}; | |
| 80 | 75 | Rows<T> dst_rows{dst, dst_stride}; | |
| 81 | 75 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 82 | 75 | return KLEIDICV_OK; | |
| 83 | 81 | } | |
| 84 | |||
// Expands to an explicit instantiation of exp<type>. The expansion carries no
// trailing semicolon, so the use site supplies it.
#define KLEIDICV_INSTANTIATE_TEMPLATE(type)                             \
  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp<type>(         \
      const type* src, size_t src_stride, type* dst, size_t dst_stride, \
      size_t width, size_t height)

// float is the only element type instantiated here; Exp is specialised for
// float only in this file.
KLEIDICV_INSTANTIATE_TEMPLATE(float);
| 91 | |||
| 92 | } // namespace kleidicv::neon | ||
| 93 |