| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_EXP_SC_H | ||
| 6 | #define KLEIDICV_EXP_SC_H | ||
| 7 | |||
| 8 | #include "kleidicv/arithmetics/exp_constants.h" | ||
| 9 | #include "kleidicv/kleidicv.h" | ||
| 10 | #include "kleidicv/sve2.h" | ||
| 11 | |||
| 12 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 13 | template <typename ScalarType, bool TryShortPath> | ||
| 14 | class Exp; | ||
| 15 | |||
| 16 | template <bool TryShortPath> | ||
| 17 | class Exp<float, TryShortPath> final : public UnrollTwice { | ||
| 18 | public: | ||
| 19 | using ContextType = Context; | ||
| 20 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
| 21 | using VectorType = typename VecTraits::VectorType; | ||
| 22 | |||
| 23 | 6097 | VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { | |
| 24 | 6097 | svfloat32_t n, r, poly, z; | |
| 25 | 6097 | svuint32_t e; | |
| 26 | |||
| 27 | /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] | ||
| 28 | x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ | ||
| 29 | 6097 | z = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kShift), src, | |
| 30 | exp_f32::kInvLn2); | ||
| 31 | 6097 | n = svsub_x(ctx.predicate(), z, exp_f32::kShift); | |
| 32 | 6097 | r = svmla_x(ctx.predicate(), src, n, -exp_f32::kLn2Hi); | |
| 33 | 6097 | r = svmla_x(ctx.predicate(), r, n, -exp_f32::kLn2Lo); | |
| 34 | 6097 | e = svlsl_x(ctx.predicate(), svreinterpret_u32(z), 23); | |
| 35 | 12194 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[1]), | |
| 36 | 6097 | svdup_f32(exp_f32::kPoly[0]), r); | |
| 37 | 6097 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[2]), poly, r); | |
| 38 | 6097 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[3]), poly, r); | |
| 39 | 6097 | poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[4]), poly, r); | |
| 40 | 6097 | poly = svmla_x(ctx.predicate(), svdup_f32(1.0F), poly, r); | |
| 41 | 6097 | poly = svmla_x(ctx.predicate(), svdup_f32(1.0F), poly, r); | |
| 42 | |||
| 43 | if constexpr (TryShortPath) { | ||
| 44 | 2937 | svbool_t cmp = svacgt(ctx.predicate(), n, 126.0F); | |
| 45 |
2/2✓ Branch 0 taken 2641 times.
✓ Branch 1 taken 296 times.
|
2937 | if (KLEIDICV_UNLIKELY(svptest_any(ctx.predicate(), cmp))) { |
| 46 | 2641 | return specialcase(ctx.predicate(), poly, n, e); | |
| 47 | } | ||
| 48 | 592 | svfloat32_t scale = | |
| 49 | 296 | svreinterpret_f32(svadd_x(ctx.predicate(), e, 0x3f800000U)); | |
| 50 | 296 | return svmul_x(ctx.predicate(), scale, poly); | |
| 51 | 2937 | } | |
| 52 | |||
| 53 | 6320 | return specialcase(ctx.predicate(), poly, n, e); | |
| 54 | 6097 | } | |
| 55 | |||
| 56 | private: | ||
| 57 | 5801 | static svfloat32_t specialcase(svbool_t pg, svfloat32_t poly, svfloat32_t n, | |
| 58 | svuint32_t e) KLEIDICV_STREAMING { | ||
| 59 | /* 2^n may overflow, break it up into s1*s2. */ | ||
| 60 | 11602 | svuint32_t b = svsel(svcmple(pg, n, svdup_f32(0.0F)), | |
| 61 | 5801 | svdup_u32(0x83000000U), svdup_u32(0.0F)); | |
| 62 | 5801 | svfloat32_t s1 = svreinterpret_f32(svadd_x(pg, b, 0x7f000000U)); | |
| 63 | 5801 | svfloat32_t s2 = svreinterpret_f32(svsub_x(pg, e, b)); | |
| 64 | 5801 | svbool_t cmp = svacgt(pg, n, 192.0F); | |
| 65 | 5801 | svfloat32_t r1 = svmul_x(pg, s1, s1); | |
| 66 | 5801 | svfloat32_t r0 = svmul_x(pg, s2, svmul_x(pg, poly, s1)); | |
| 67 | |||
| 68 | 11602 | return svsel(cmp, r1, r0); | |
| 69 | 5801 | } | |
| 70 | }; // end of class Exp<float> | ||
| 71 | |||
| 72 | template <typename T> | ||
| 73 | using ExpNoShortPath = Exp<T, false>; | ||
| 74 | |||
| 75 | template <typename T> | ||
| 76 | using ExpTryShortPath = Exp<T, true>; | ||
| 77 | |||
| 78 | template <typename T, typename Operation> | ||
| 79 | 243 | static kleidicv_error_t exp_sc(const T* src, size_t src_stride, T* dst, | |
| 80 | size_t dst_stride, size_t width, | ||
| 81 | size_t height) KLEIDICV_STREAMING { | ||
| 82 |
4/4✓ Branch 0 taken 6 times.
✓ Branch 1 taken 237 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 237 times.
|
243 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 83 |
4/4✓ Branch 0 taken 6 times.
✓ Branch 1 taken 231 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 231 times.
|
237 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 84 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 228 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 225 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 225 times.
|
231 | CHECK_IMAGE_SIZE(width, height); |
| 85 | |||
| 86 | 225 | Operation operation; | |
| 87 | 225 | Rectangle rect{width, height}; | |
| 88 | 225 | Rows<const T> src_rows{src, src_stride}; | |
| 89 | 225 | Rows<T> dst_rows{dst, dst_stride}; | |
| 90 | 225 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 91 | 225 | return KLEIDICV_OK; | |
| 92 | 243 | } | |
| 93 | |||
| 94 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 95 | |||
| 96 | #endif // KLEIDICV_EXP_SC_H | ||
| 97 |