Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_EXP_SC_H | ||
6 | #define KLEIDICV_EXP_SC_H | ||
7 | |||
8 | #include "kleidicv/arithmetics/exp_constants.h" | ||
9 | #include "kleidicv/kleidicv.h" | ||
10 | #include "kleidicv/sve2.h" | ||
11 | |||
12 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
13 | template <typename ScalarType, bool TryShortPath> /* Primary template: declared only; the float specialization below is the only definition visible in this listing. */ | ||
14 | class Exp; | ||
15 | /* Element-wise exp on svfloat32_t vectors. TryShortPath selects at compile time whether vector_path first tries a cheaper scaling when no active lane has abs(n) > 126, or always goes through specialcase. */|||
16 | template <bool TryShortPath> | ||
17 | class Exp<float, TryShortPath> final : public UnrollOnce { | ||
18 | public: | ||
19 | using ContextType = Context; | ||
20 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
21 | using VectorType = typename VecTraits::VectorType; | ||
22 | /* Approximates exp(src) per lane under ctx.predicate(): reduces src to an integer-valued n and a remainder r, evaluates a polynomial in r, then scales the polynomial by 2^n. */|||
23 | 4517 | VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { | |
24 | 4517 | svfloat32_t n, r, poly, z; | |
25 | 4517 | svuint32_t e; | |
26 | |||
27 | /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] | ||
28 | x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ | ||
29 | 4517 | z = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kShift), src, /* z = kShift + src * kInvLn2 */ | |
30 | exp_f32::kInvLn2); | ||
31 | 4517 | n = svsub_x(ctx.predicate(), z, exp_f32::kShift); /* n = z - kShift; presumably kShift is chosen so that this add/subtract pair rounds src/ln2 to an integer - confirm in exp_constants.h */ | |
32 | 4517 | r = svmla_x(ctx.predicate(), src, n, -exp_f32::kLn2Hi); /* r = src - n * kLn2Hi */ | |
33 | 4517 | r = svmla_x(ctx.predicate(), r, n, -exp_f32::kLn2Lo); /* r -= n * kLn2Lo; ln2 is presumably split into a high and a low part for extra precision - confirm */ | |
34 | 4517 | e = svlsl_x(ctx.predicate(), svreinterpret_u32(z), 23); /* shift the raw bits of z so that its low bits land in the float32 exponent field (bit 23 upwards) */ | |
35 | 9034 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[1]), /* Horner evaluation in r, starting with kPoly[1] + kPoly[0] * r */ | |
36 | 4517 | svdup_f32(exp_f32::kPoly[0]), r); | |
37 | 4517 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[2]), poly, r); | |
38 | 4517 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[3]), poly, r); | |
39 | 4517 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[4]), poly, r); | |
40 | 4517 | poly = svmla_x(ctx.predicate(), svdup_f32(1.0F), poly, r); /* coefficient 1.0F of the linear term */ | |
41 | 4517 | poly = svmla_x(ctx.predicate(), svdup_f32(1.0F), poly, r); /* coefficient 1.0F of the constant term; the repeated step matches exp(r) = 1 + r + ... and is not an accidental duplicate */ | |
42 | |||
43 | if constexpr (TryShortPath) { | ||
44 | 2937 | svbool_t cmp = svacgt(ctx.predicate(), n, 126.0F); /* flags lanes where abs(n) > 126 */ | |
45 |
2/2✓ Branch 0 taken 2638 times.
✓ Branch 1 taken 299 times.
|
2937 | if (KLEIDICV_UNLIKELY(svptest_any(ctx.predicate(), cmp))) { /* at least one active lane is flagged: use the careful scaling for the whole vector */ |
46 | 2638 | return specialcase(ctx.predicate(), poly, n, e); | |
47 | } | ||
48 | 598 | svfloat32_t scale = | |
49 | 299 | svreinterpret_f32(svadd_x(ctx.predicate(), e, 0x3f800000U)); /* 0x3f800000 is the bit pattern of 1.0F; adding e to it is intended to produce 2^n as a float */ | |
50 | 299 | return svmul_x(ctx.predicate(), scale, poly); | |
51 | 2937 | } | |
52 | |||
53 | 3160 | return specialcase(ctx.predicate(), poly, n, e); /* only reached when TryShortPath is false, since both paths above return */ | |
54 | 4517 | } | |
55 | /* Scales poly by 2^n with the scale split into two factors s1 and s2, because 2^n on its own may overflow; lanes with abs(n) > 192 return s1 * s1 instead. */|||
56 | private: | ||
57 | 4218 | static svfloat32_t specialcase(svbool_t pg, svfloat32_t poly, svfloat32_t n, | |
58 | svuint32_t e) KLEIDICV_STREAMING { | ||
59 | /* 2^n may overflow, break it up into s1*s2. */ | ||
60 | 8436 | svuint32_t b = svsel(svcmple(pg, n, svdup_f32(0.0F)), /* b is 0x83000000 in lanes where n <= 0 and 0 elsewhere */ | |
61 | 4218 | svdup_u32(0x83000000U), svdup_u32(0.0F)); /* NOTE(review): svdup_u32 is given the float literal 0.0F, which converts to 0; presumably 0U was intended - confirm */ | |
62 | 4218 | svfloat32_t s1 = svreinterpret_f32(svadd_x(pg, b, 0x7f000000U)); /* s1 bits: 0x7f000000 (2^127) where b is 0; where b is 0x83000000 the 32-bit sum wraps to 0x02000000 (2^-123) */ | |
63 | 4218 | svfloat32_t s2 = svreinterpret_f32(svsub_x(pg, e, b)); /* s2 bits: e with the same offset b subtracted */ | |
64 | 4218 | svbool_t cmp = svacgt(pg, n, 192.0F); /* flags lanes where abs(n) > 192 */ | |
65 | 4218 | svfloat32_t r1 = svmul_x(pg, s1, s1); /* s1 * s1 is out of float range: +inf for s1 = 2^127, 0 for s1 = 2^-123 */ | |
66 | 4218 | svfloat32_t r0 = svmul_x(pg, s2, svmul_x(pg, poly, s1)); /* (poly * s1) * s2 for the remaining lanes */ | |
67 | |||
68 | 8436 | return svsel(cmp, r1, r0); /* r1 where abs(n) > 192, r0 otherwise */ | |
69 | 4218 | } | |
70 | }; // end of class Exp<float> | ||
71 | |||
72 | template <typename T> | ||
73 | using ExpNoShortPath = Exp<T, false>; /* TryShortPath = false: vector_path always scales through specialcase */ | ||
74 | |||
75 | template <typename T> | ||
76 | using ExpTryShortPath = Exp<T, true>; /* TryShortPath = true: vector_path uses the cheaper scaling unless some active lane has abs(n) > 126 */ | ||
77 | |||
78 | template <typename T, typename Operation> /* Entry point: checks the arguments, then hands a default-constructed Operation plus the source and destination rows to apply_operation_by_rows. Returns KLEIDICV_OK once that call has completed. */ | ||
79 | 162 | static kleidicv_error_t exp_sc(const T* src, size_t src_stride, T* dst, | |
80 | size_t dst_stride, size_t width, | ||
81 | size_t height) KLEIDICV_STREAMING { | ||
82 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 158 times.
|
162 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); /* macro defined elsewhere; presumably returns an error code early for an invalid pointer or stride - confirm */ |
83 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 154 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 154 times.
|
158 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); /* same check for the destination */ |
84 |
6/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 150 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 150 times.
|
154 | CHECK_IMAGE_SIZE(width, height); /* macro defined elsewhere; presumably rejects unsupported image dimensions with an early return - confirm */ |
85 | |||
86 | 150 | Operation operation; | |
87 | 150 | Rectangle rect{width, height}; | |
88 | 150 | Rows<const T> src_rows{src, src_stride}; | |
89 | 150 | Rows<T> dst_rows{dst, dst_stride}; | |
90 | 150 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); /* helper defined elsewhere; presumably applies operation to each row of rect, reading src_rows and writing dst_rows - confirm */ | |
91 | 150 | return KLEIDICV_OK; | |
92 | 162 | } | |
93 | |||
94 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
95 | |||
96 | #endif // KLEIDICV_EXP_SC_H | ||
97 |