Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cmath> | ||
6 | |||
7 | #include "kleidicv/arithmetics/exp_constants.h" | ||
8 | #include "kleidicv/kleidicv.h" | ||
9 | #include "kleidicv/neon.h" | ||
10 | |||
11 | namespace kleidicv::neon { | ||
12 | |||
13 | template <typename ScalarType> | ||
14 | class Exp; | ||
15 | |||
16 | template <> | ||
17 | class Exp<float> final : public UnrollOnce { | ||
18 | public: | ||
19 | using VecTraits = neon::VecTraits<float>; | ||
20 | using VectorType = typename VecTraits::VectorType; | ||
21 | |||
22 | 2903 | VectorType vector_path(VectorType src) { | |
23 | 2903 | float32x4_t n, r, scale, poly, z; | |
24 | 2903 | uint32x4_t cmp, e; | |
25 | |||
26 | /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] | ||
27 | x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ | ||
28 | 2903 | z = vfmaq_f32(vdupq_n(exp_f32::kShift), src, vdupq_n(exp_f32::kInvLn2)); | |
29 | 2903 | n = z - vdupq_n(exp_f32::kShift); | |
30 | 2903 | r = vfmaq_f32(src, n, vdupq_n(-exp_f32::kLn2Hi)); | |
31 | 2903 | r = vfmaq_f32(r, n, vdupq_n(-exp_f32::kLn2Lo)); | |
32 | 2903 | e = vreinterpretq_u32_f32(z) << 23; | |
33 | 2903 | scale = vreinterpretq_f32_u32(e + vdupq_n(0x3f800000)); | |
34 | 2903 | cmp = vcagtq_f32(n, vdupq_n(126.0F)); | |
35 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[1]), vdupq_n(exp_f32::kPoly[0]), r); | |
36 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[2]), poly, r); | |
37 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[3]), poly, r); | |
38 | 2903 | poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[4]), poly, r); | |
39 | 2903 | poly = vfmaq_f32(vdupq_n(1.0F), poly, r); | |
40 | 2903 | poly = vfmaq_f32(vdupq_n(1.0F), poly, r); | |
41 |
2/2✓ Branch 0 taken 2624 times.
✓ Branch 1 taken 279 times.
|
2903 | if (KLEIDICV_UNLIKELY(v_any_u32(cmp))) { |
42 | 2624 | return specialcase(poly, n, e); | |
43 | } | ||
44 | 279 | return scale * poly; | |
45 | 2903 | } | |
46 | |||
47 | 80 | float scalar_path(float src) { return expf(src); } | |
48 | |||
49 | private: | ||
50 | 2903 | static int v_any_u32(uint32x4_t x) { | |
51 | /* assume elements in x are either 0 or -1u. */ | ||
52 | 2903 | return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0; | |
53 | } | ||
54 | |||
55 | 2624 | static float32x4_t specialcase(float32x4_t poly, float32x4_t n, | |
56 | uint32x4_t e) { | ||
57 | /* 2^n may overflow, break it up into s1*s2. */ | ||
58 | 2624 | uint32x4_t b = (n <= vdupq_n(0.0F)) & vdupq_n(0x83000000); | |
59 | 2624 | float32x4_t s1 = vreinterpretq_f32_u32(vdupq_n(0x7f000000) + b); | |
60 | 2624 | float32x4_t s2 = vreinterpretq_f32_u32(e - b); | |
61 | 2624 | uint32x4_t cmp = vcagtq_f32(n, vdupq_n(192.0F)); | |
62 | 2624 | float32x4_t r1 = s1 * s1; | |
63 | 2624 | float32x4_t r0 = (poly * s1) * s2; | |
64 | 7872 | return vreinterpretq_f32_u32((cmp & vreinterpretq_u32_f32(r1)) | | |
65 | 2624 | (~cmp & vreinterpretq_u32_f32(r0))); | |
66 | 2624 | } | |
67 | }; // end of class Exp<float> | ||
68 | |||
69 | template <typename T> | ||
70 | 81 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp(const T* src, size_t src_stride, | |
71 | T* dst, size_t dst_stride, | ||
72 | size_t width, size_t height) { | ||
73 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 79 times.
|
81 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
74 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 77 times.
|
79 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
75 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 75 times.
|
77 | CHECK_IMAGE_SIZE(width, height); |
76 | |||
77 | 75 | Exp<T> operation; | |
78 | 75 | Rectangle rect{width, height}; | |
79 | 75 | Rows<const T> src_rows{src, src_stride}; | |
80 | 75 | Rows<T> dst_rows{dst, dst_stride}; | |
81 | 75 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
82 | 75 | return KLEIDICV_OK; | |
83 | 81 | } | |
84 | |||
85 | #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ | ||
86 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp<type>( \ | ||
87 | const type* src, size_t src_stride, type* dst, size_t dst_stride, \ | ||
88 | size_t width, size_t height) | ||
89 | |||
90 | KLEIDICV_INSTANTIATE_TEMPLATE(float); | ||
91 | |||
92 | } // namespace kleidicv::neon | ||
93 |