| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_FLOAT_CONV_SC_H | ||
| 6 | #define KLEIDICV_FLOAT_CONV_SC_H | ||
| 7 | |||
| 8 | #include <limits> | ||
| 9 | #include <type_traits> | ||
| 10 | |||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/sve2.h" | ||
| 13 | |||
| 14 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 15 | |||
| 16 | template <typename InputType, typename OutputType> | ||
| 17 | class float_conversion_operation; | ||
| 18 | |||
| 19 | template <typename OutputType> | ||
| 20 | class float_conversion_operation<float, OutputType> { | ||
| 21 | public: | ||
| 22 | using SrcVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
| 23 | using SrcVectorType = typename SrcVecTraits::VectorType; | ||
| 24 | using SrcVector4Type = typename SrcVecTraits::Vector4Type; | ||
| 25 | using IntermediateVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits< | ||
| 26 | std::conditional_t<std::is_signed_v<OutputType>, int32_t, uint32_t>>; | ||
| 27 | using IntermediateVectorType = typename IntermediateVecTraits::VectorType; | ||
| 28 | using DstVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<OutputType>; | ||
| 29 | using DstVectorType = typename DstVecTraits::VectorType; | ||
| 30 | |||
| 31 | 450 | explicit float_conversion_operation(svuint8_t& index) KLEIDICV_STREAMING | |
| 32 | 450 | : svqvct_wrapper(index) {} | |
| 33 | |||
| 34 | 692 | void process_row(size_t width, Columns<const float> src, | |
| 35 | Columns<OutputType> dst) KLEIDICV_STREAMING { | ||
| 36 | 1384 | LoopUnroll{width, SrcVecTraits::num_lanes()} | |
| 37 | 2028 | .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING { | |
| 38 | 1336 | svbool_t pg = DstVecTraits::svptrue(); | |
| 39 | #if KLEIDICV_TARGET_SME2 | ||
| 40 | 206 | svcount_t pg_counter = DstVecTraits::svptrue_c(); | |
| 41 | 206 | SrcVector4Type src4 = svld1_x4(pg_counter, &src[0]); | |
| 42 | 206 | SrcVectorType src_v0 = svget4(src4, 0); | |
| 43 | 206 | SrcVectorType src_v1 = svget4(src4, 1); | |
| 44 | 206 | SrcVectorType src_v2 = svget4(src4, 2); | |
| 45 | 206 | SrcVectorType src_v3 = svget4(src4, 3); | |
| 46 | #else | ||
| 47 | 1130 | SrcVectorType src_v0 = svld1(pg, &src[0]); | |
| 48 | 1130 | SrcVectorType src_v1 = svld1_vnum(pg, &src[0], 1); | |
| 49 | 1130 | SrcVectorType src_v2 = svld1_vnum(pg, &src[0], 2); | |
| 50 | 1130 | SrcVectorType src_v3 = svld1_vnum(pg, &src[0], 3); | |
| 51 | #endif // KLEIDICV_TARGET_SME2 | ||
| 52 | 1336 | DstVectorType res0 = vector_path(pg, src_v0, src_v1, src_v2, src_v3); | |
| 53 | 1336 | svst1(pg, &dst[0], res0); | |
| 54 | 1336 | src += ptrdiff_t(step); | |
| 55 | 1336 | dst += ptrdiff_t(step); | |
| 56 | 1336 | }) | |
| 57 | 1228 | .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { | |
| 58 | 536 | size_t index = 0; | |
| 59 | 536 | svbool_t pg = SrcVecTraits::svwhilelt(index, length); | |
| 60 |
4/4✓ Branch 0 taken 432 times.
✓ Branch 1 taken 268 times.
✓ Branch 2 taken 432 times.
✓ Branch 3 taken 268 times.
|
1400 | while (svptest_first(SrcVecTraits::svptrue(), pg)) { |
| 61 | 864 | SrcVectorType src_vector = svld1(pg, &src[ptrdiff_t(index)]); | |
| 62 | 1728 | IntermediateVectorType result_vector = | |
| 63 | 864 | remaining_path<OutputType>(pg, src_vector); | |
| 64 | 864 | svst1b(pg, &dst[ptrdiff_t(index)], result_vector); | |
| 65 | // Update loop counter and calculate the next governing predicate. | ||
| 66 | 864 | index += SrcVecTraits::num_lanes(); | |
| 67 | 864 | pg = SrcVecTraits::svwhilelt(index, length); | |
| 68 | 864 | } | |
| 69 | 536 | }); | |
| 70 | 692 | } | |
| 71 | |||
| 72 | private: | ||
| 73 | template < | ||
| 74 | typename O, | ||
| 75 | std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0> | ||
| 76 | 2672 | decltype(auto) convert(svbool_t full_pg, | |
| 77 | SrcVectorType in) KLEIDICV_STREAMING { | ||
| 78 | 2672 | return svcvt_s32_f32_x(full_pg, in); | |
| 79 | } | ||
| 80 | |||
| 81 | template < | ||
| 82 | typename O, | ||
| 83 | std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0> | ||
| 84 | 2672 | decltype(auto) convert(svbool_t full_pg, | |
| 85 | SrcVectorType in) KLEIDICV_STREAMING { | ||
| 86 | 2672 | return svcvt_u32_f32_x(full_pg, in); | |
| 87 | } | ||
| 88 | |||
| 89 | 1336 | DstVectorType vector_path(svbool_t full_pg, SrcVectorType fsrc0, | |
| 90 | SrcVectorType fsrc1, SrcVectorType fsrc2, | ||
| 91 | SrcVectorType fsrc3) KLEIDICV_STREAMING { | ||
| 92 | 1336 | fsrc0 = svrinti_f32_x(full_pg, fsrc0); | |
| 93 | 1336 | fsrc1 = svrinti_f32_x(full_pg, fsrc1); | |
| 94 | 1336 | fsrc2 = svrinti_f32_x(full_pg, fsrc2); | |
| 95 | 1336 | fsrc3 = svrinti_f32_x(full_pg, fsrc3); | |
| 96 | |||
| 97 | 1336 | auto _32bit_res0 = convert<OutputType>(full_pg, fsrc0); | |
| 98 | 1336 | auto _32bit_res1 = convert<OutputType>(full_pg, fsrc1); | |
| 99 | 1336 | auto _32bit_res2 = convert<OutputType>(full_pg, fsrc2); | |
| 100 | 1336 | auto _32bit_res3 = convert<OutputType>(full_pg, fsrc3); | |
| 101 | |||
| 102 | 3802 | return svqvct_wrapper( | |
| 103 | 1336 | svcreate4(_32bit_res0, _32bit_res1, _32bit_res2, _32bit_res3)); | |
| 104 | 1336 | } | |
| 105 | |||
| 106 | template < | ||
| 107 | typename O, | ||
| 108 | std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0> | ||
| 109 | 432 | IntermediateVectorType remaining_path(svbool_t& pg, | |
| 110 | SrcVectorType src) KLEIDICV_STREAMING { | ||
| 111 | 432 | constexpr float min_val = std::numeric_limits<O>::lowest(); | |
| 112 | 432 | constexpr float max_val = std::numeric_limits<O>::max(); | |
| 113 | |||
| 114 | 432 | src = svrinti_f32_x(pg, src); | |
| 115 | |||
| 116 | 432 | svbool_t less = svcmplt_n_f32(pg, src, min_val); | |
| 117 | 432 | src = svdup_n_f32_m(src, less, min_val); | |
| 118 | |||
| 119 | 432 | svbool_t greater = svcmpgt_n_f32(pg, src, max_val); | |
| 120 | 432 | src = svdup_n_f32_m(src, greater, max_val); | |
| 121 | |||
| 122 | 864 | return svcvt_s32_f32_x(pg, src); | |
| 123 | 432 | } | |
| 124 | |||
| 125 | template < | ||
| 126 | typename O, | ||
| 127 | std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0> | ||
| 128 | 432 | IntermediateVectorType remaining_path(svbool_t& pg, | |
| 129 | SrcVectorType src) KLEIDICV_STREAMING { | ||
| 130 | 432 | constexpr float max_val = std::numeric_limits<O>::max(); | |
| 131 | |||
| 132 | 432 | src = svrinti_f32_x(pg, src); | |
| 133 | |||
| 134 | 432 | svbool_t greater = svcmpgt_n_f32(pg, src, max_val); | |
| 135 | 432 | src = svdup_n_f32_m(src, greater, max_val); | |
| 136 | |||
| 137 | 864 | return svcvt_u32_f32_x(pg, src); | |
| 138 | 432 | } | |
| 139 | |||
| 140 | SvqvctWrapper svqvct_wrapper; | ||
| 141 | }; // end of class float_conversion_operation<float, OutputType> | ||
| 142 | |||
| 143 | template <typename InputType> | ||
| 144 | class float_conversion_operation<InputType, float> { | ||
| 145 | public: | ||
| 146 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
| 147 | using VectorType = typename VecTraits::VectorType; | ||
| 148 | using Vector2Type = typename VecTraits::Vector2Type; | ||
| 149 | |||
| 150 | 450 | explicit float_conversion_operation(svuint8_t&) {} | |
| 151 | |||
| 152 | 692 | void process_row(size_t width, Columns<const InputType> src, | |
| 153 | Columns<float> dst) KLEIDICV_STREAMING { | ||
| 154 | 1384 | LoopUnroll{width, VecTraits::num_lanes()} | |
| 155 | 3486 | .unroll_twice([&](size_t step) KLEIDICV_STREAMING { | |
| 156 | 2794 | svbool_t pg = VecTraits::svptrue(); | |
| 157 | 2794 | auto src_vect1 = load_src(pg, &src[0], 0); | |
| 158 | 2794 | auto src_vect2 = load_src(pg, &src[0], 1); | |
| 159 | |||
| 160 | 2794 | VectorType dst_vector1 = vector_path(pg, src_vect1); | |
| 161 | 2794 | VectorType dst_vector2 = vector_path(pg, src_vect2); | |
| 162 | #if KLEIDICV_TARGET_SME2 | ||
| 163 | 450 | svcount_t pg_counter = VecTraits::svptrue_c(); | |
| 164 | 450 | Vector2Type res2 = svcreate2(dst_vector1, dst_vector2); | |
| 165 | 450 | svst1(pg_counter, &dst[0], res2); | |
| 166 | #else | ||
| 167 | 2344 | svst1(pg, &dst[0], dst_vector1); | |
| 168 | 2344 | svst1_vnum(pg, &dst[0], 1, dst_vector2); | |
| 169 | #endif // KLEIDICV_TARGET_SME2 | ||
| 170 | 2794 | src += ptrdiff_t(step); | |
| 171 | 2794 | dst += ptrdiff_t(step); | |
| 172 | 2794 | }) | |
| 173 | 1174 | .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { | |
| 174 | 482 | size_t index = 0; | |
| 175 | 482 | svbool_t pg = VecTraits::svwhilelt(index, length); | |
| 176 |
4/4✓ Branch 0 taken 302 times.
✓ Branch 1 taken 241 times.
✓ Branch 2 taken 302 times.
✓ Branch 3 taken 241 times.
|
1086 | while (svptest_first(VecTraits::svptrue(), pg)) { |
| 177 | 604 | auto src_vect = load_src(pg, &src[ptrdiff_t(index)], 0); | |
| 178 | 604 | VectorType dst_vector = vector_path(pg, src_vect); | |
| 179 | 604 | svst1(pg, &dst[ptrdiff_t(index)], dst_vector); | |
| 180 | // Update loop counter and calculate the next governing predicate. | ||
| 181 | 604 | index += VecTraits::num_lanes(); | |
| 182 | 604 | pg = VecTraits::svwhilelt(index, length); | |
| 183 | 604 | } | |
| 184 | 482 | }); | |
| 185 | 692 | } | |
| 186 | |||
| 187 | private: | ||
| 188 | template <typename I, std::enable_if_t<std::is_same_v<I, svint32_t>, int> = 0> | ||
| 189 | 3096 | VectorType vector_path(svbool_t& pg, I src_vector) KLEIDICV_STREAMING { | |
| 190 | 3096 | return svcvt_f32_s32_x(pg, src_vector); | |
| 191 | } | ||
| 192 | template <typename I, | ||
| 193 | std::enable_if_t<std::is_same_v<I, svuint32_t>, int> = 0> | ||
| 194 | 3096 | VectorType vector_path(svbool_t& pg, I src_vector) KLEIDICV_STREAMING { | |
| 195 | 3096 | return svcvt_f32_u32_x(pg, src_vector); | |
| 196 | } | ||
| 197 | |||
| 198 | template < | ||
| 199 | typename I, | ||
| 200 | std::enable_if_t<std::is_integral_v<I> && std::is_signed_v<I>, int> = 0> | ||
| 201 | 3096 | svint32_t load_src(svbool_t& pg, const I* src, | |
| 202 | size_t vnum) KLEIDICV_STREAMING { | ||
| 203 | 3096 | svint32_t src_vect = svld1sb_vnum_s32(pg, src, vnum); | |
| 204 | 6192 | return src_vect; | |
| 205 | 3096 | } | |
| 206 | |||
| 207 | template < | ||
| 208 | typename I, | ||
| 209 | std::enable_if_t<std::is_integral_v<I> && !std::is_signed_v<I>, int> = 0> | ||
| 210 | 3096 | svuint32_t load_src(svbool_t& pg, const I* src, | |
| 211 | size_t vnum) KLEIDICV_STREAMING { | ||
| 212 | 3096 | svuint32_t src_vect = svld1ub_vnum_u32(pg, src, vnum); | |
| 213 | 6192 | return src_vect; | |
| 214 | 3096 | } | |
| 215 | }; // end of class float_conversion_operation<InputType, float> | ||
| 216 | |||
| 217 | template <typename InputType, typename OutputType> | ||
| 218 | 960 | static kleidicv_error_t float_conversion_sc(const InputType* src, | |
| 219 | size_t src_stride, OutputType* dst, | ||
| 220 | size_t dst_stride, size_t width, | ||
| 221 | size_t height) KLEIDICV_STREAMING { | ||
| 222 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 237 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 237 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 237 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 237 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 237 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 237 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 237 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 237 times.
|
960 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 223 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 234 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 234 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 234 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 234 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 234 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 234 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 234 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 234 times.
|
948 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 224 |
24/24✓ Branch 0 taken 3 times.
✓ Branch 1 taken 231 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 225 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 225 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 231 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 225 times.
✓ Branch 10 taken 9 times.
✓ Branch 11 taken 225 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 231 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 225 times.
✓ Branch 16 taken 9 times.
✓ Branch 17 taken 225 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 231 times.
✓ Branch 20 taken 6 times.
✓ Branch 21 taken 225 times.
✓ Branch 22 taken 9 times.
✓ Branch 23 taken 225 times.
|
936 | CHECK_IMAGE_SIZE(width, height); |
| 225 | |||
| 226 | 900 | svuint8_t index; | |
| 227 | 900 | float_conversion_operation<InputType, OutputType> operation{index}; | |
| 228 | 900 | Rectangle rect{width, height}; | |
| 229 | 900 | Rows<const InputType> src_rows{src, src_stride}; | |
| 230 | 900 | Rows<OutputType> dst_rows{dst, dst_stride}; | |
| 231 | 900 | zip_rows(operation, rect, src_rows, dst_rows); | |
| 232 | |||
| 233 | 900 | return KLEIDICV_OK; | |
| 234 | 960 | } | |
| 235 | |||
| 236 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 237 | |||
| 238 | #endif // KLEIDICV_FLOAT_CONV_SC_H | ||
| 239 |