| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <climits> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstdint> | ||
| 8 | #include <limits> | ||
| 9 | |||
| 10 | #include "kleidicv/arithmetics/scale.h" | ||
| 11 | #include "kleidicv/neon.h" | ||
| 12 | #include "kleidicv/traits.h" | ||
| 13 | |||
| 14 | namespace kleidicv::neon { | ||
| 15 | |||
| 16 | // Scale algorithm: for each value in the source, | ||
| 17 | // dst[i] = src[i] * scale + shift (floating point operation) | ||
| 18 | // | ||
| 19 | // Unsigned 8-bit implementation | ||
| 20 | // | ||
| 21 | // Since converting from uint8 to float32 and back takes more steps, | ||
| 22 | // 'ScaleTbx' saves time by pre-calculating all 256 values and uses TBLs | ||
| 23 | // and TBXs to map the values directly from uint8 to uint8: | ||
| 24 | // i: 0 to 255: tbl[i] = i * scale + shift | ||
| 25 | // | ||
| 26 | // Since a single TBL instruction can map only 16 values, more TBX instructions | ||
| 27 | // are needed for the remaining 240 values. After the first TBL (that replaces | ||
| 28 | // 0-15 values with indexed values from the table) 16 is subtracted from all | ||
| 29 | // lanes in the source vector before the next TBX is done, so when indexing 0 | ||
| 30 | // to 15, actually 16 to 31 values are replaced from the original source vector. | ||
| 31 | // | ||
| 32 | // Example: | ||
| 33 | // scale = 1 | ||
| 34 | // shift = 100 | ||
| 35 | // Initialization: (it also takes time, so for short inputs it's not used) | ||
| 36 | // tbl = [ 100, 101, 102, ..., 255, <100 times 255, it's saturated>] | ||
| 37 | // Copy table to vector registers: | ||
| 38 | // t0 = [ 100, ..., 115 ] | ||
| 39 | // t1 = [ 116, ..., 131 ] | ||
| 40 | // t2 = [ 132, ..., 147 ] | ||
| 41 | // ... | ||
| 42 | // t15 = [ 255, ..., 255 ] | ||
| 43 | // | ||
| 44 | // input: v = [ 21, 3, 39, 6 ] | ||
| 45 | // TBL(t0): d = [ 0, 103, 0, 106 ] // indices >= 16 result in 0 | ||
| 46 | // SUB: v = [ 5, 243, 23, 246 ] // subtracted 16 --> next table | ||
| 47 | // TBX(t1): d = [ 121, 103, 0, 106 ] // indices >= 16 are ignored | ||
| 48 | // SUB: v = [ 245, 227, 7, 230 ] // subtracted 16 --> next table | ||
| 49 | // TBX(t2): d = [ 121, 103, 139, 106 ] // indices >= 16 are ignored | ||
| 50 | // ... etc. | ||
| 51 | // | ||
| 52 | // Bigger index tables (32, 48 or 64 values) can be used by TBX2 - TBX3 - TBX4. | ||
| 53 | // In this case, instead of 16, 2/3/4 * 16 have to be subtracted from source. | ||
| 54 | // The below solution (combining TBX2-TBX3) gives a good compromise between code | ||
| 55 | // size and speed. | ||
| 56 | |||
| 57 | template <typename ScalarType> | ||
| 58 | class ScaleIntBase : public UnrollTwice { | ||
| 59 | public: | ||
| 60 | 340 | ScaleIntBase(float scale, float shift) : scale_{scale}, shift_{shift} {} | |
| 61 | |||
| 62 | protected: | ||
| 63 | static constexpr ScalarType ScalarMax = | ||
| 64 | std::numeric_limits<ScalarType>::max(); | ||
| 65 | |||
| 66 | float scale_, shift_; | ||
| 67 | }; | ||
| 68 | |||
| 69 | template <typename T, typename U> | ||
| 70 | kleidicv_error_t scale(const T *src, size_t src_stride, U *dst, | ||
| 71 | size_t dst_stride, size_t width, size_t height, | ||
| 72 | double scale, double shift); | ||
| 73 | |||
// Scalar form of the scale operation: computes value * scale + shift, rounds
// it with lrintf() (current rounding mode) and saturates the result to
// [0, max(T)]. Negative and NaN results map to 0.
//
// The saturation is done in floating point *before* rounding: lrintf() has an
// unspecified result when the rounded value does not fit in `long`, so huge
// products (or infinities) must never reach it. Narrowing an out-of-range
// double to float is avoided for the same reason.
//
// NOTE(review): as before, this is only meaningful for unsigned T whose
// maximum fits comfortably in `long` (it is used with uint8_t in this file).
template <typename T>
T scale_value(T value, double scale, double shift) {
  static constexpr T ScalarMax = std::numeric_limits<T>::max();

  // Kept as one expression so the arithmetic is identical to the previous
  // implementation (float source value, double multiply-add).
  const double exact = static_cast<float>(value) * scale + shift;

  // Written as a negated comparison so that NaN also takes this branch.
  if (!(exact > 0.0)) {
    return static_cast<T>(0);
  }

  // Bound the value so both the narrowing to float and lrintf() stay within
  // their defined ranges.
  const double upper = static_cast<double>(ScalarMax);
  const double bounded = exact < upper ? exact : upper;

  const auto rounded =
      static_cast<uint64_t>(lrintf(static_cast<float>(bounded)));
  // The float narrowing may round up past ScalarMax for wide T; saturate.
  return rounded < ScalarMax ? static_cast<T>(rounded) : ScalarMax;
}
| 83 | |||
| 84 | class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> { | ||
| 85 | public: | ||
| 86 | using ScalarType = uint8_t; | ||
| 87 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 88 | using VectorType = typename VecTraits::VectorType; | ||
| 89 | using Vector2Type = typename VecTraits::Vector2Type; | ||
| 90 | using Vector3Type = typename VecTraits::Vector3Type; | ||
| 91 | |||
| 92 | 168 | ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table) | |
| 93 | 168 | : ScaleIntBase<uint8_t>(scale, shift), | |
| 94 | 168 | table_pointer_(precalculated_table), | |
| 95 | 168 | v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())), | |
| 96 | 168 | v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) { | |
| 97 | 168 | VecTraits::load(precalculated_table, t0_3_); | |
| 98 | 168 | VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_); | |
| 99 | 336 | VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(), | |
| 100 | 168 | t2_2_); | |
| 101 | 336 | VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(), | |
| 102 | 168 | t3_3_); | |
| 103 | 168 | VecTraits::load( | |
| 104 | 168 | precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_); | |
| 105 | 168 | VecTraits::load( | |
| 106 | 168 | precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(), | |
| 107 | 168 | t5_3_); | |
| 108 | 168 | } | |
| 109 | 1992 | VectorType vector_path(VectorType src) { | |
| 110 | 1992 | VectorType dst = vqtbl3q_u8(t0_3_, src); | |
| 111 | 1992 | src = vsubq_u8(src, v_step3_); | |
| 112 | 1992 | dst = vqtbx3q_u8(dst, t1_3_, src); | |
| 113 | 1992 | src = vsubq_u8(src, v_step3_); | |
| 114 | 1992 | dst = vqtbx2q_u8(dst, t2_2_, src); | |
| 115 | 1992 | src = vsubq_u8(src, v_step2_); | |
| 116 | 1992 | dst = vqtbx3q_u8(dst, t3_3_, src); | |
| 117 | 1992 | src = vsubq_u8(src, v_step3_); | |
| 118 | 1992 | dst = vqtbx2q_u8(dst, t4_2_, src); | |
| 119 | 1992 | src = vsubq_u8(src, v_step2_); | |
| 120 | 1992 | dst = vqtbx3q_u8(dst, t5_3_, src); | |
| 121 | 3984 | return dst; | |
| 122 | 1992 | } | |
| 123 | |||
| 124 | 1326 | ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; } | |
| 125 | |||
| 126 | private: | ||
| 127 | const ScalarType *table_pointer_; | ||
| 128 | 168 | Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{}; | |
| 129 | 168 | Vector2Type t2_2_{}, t4_2_{}; | |
| 130 | VectorType v_step3_, v_step2_; | ||
| 131 | }; // end of class ScaleUint8Tbx<T> | ||
| 132 | |||
| 133 | // Opposite to ScaleUint8Tbx, ScaleUint8Calc is the direct approach: | ||
| 134 | // - calculate dst[i] = src[i] * scale + shift using vector instructions | ||
| 135 | class ScaleUint8Calc final : public ScaleIntBase<uint8_t> { | ||
| 136 | public: | ||
| 137 | using ScalarType = uint8_t; | ||
| 138 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 139 | using VectorType = typename VecTraits::VectorType; | ||
| 140 | |||
| 141 | 172 | ScaleUint8Calc(float scale, float shift) | |
| 142 | 172 | : ScaleIntBase<ScalarType>(scale, shift), | |
| 143 | 172 | vscale_{vdupq_n_f32(scale)}, | |
| 144 | 172 | vshift_{vdupq_n_f32(shift)} {} | |
| 145 | |||
| 146 | 1320 | VectorType vector_path(VectorType src) { | |
| 147 | // For scaling, uint8 values have to be converted to uint32 | ||
| 148 | // i.e. create four vectors from one | ||
| 149 | 1320 | uint32x4_t res11 = scale_shift(vqtbl1q_u8(src, w0)); | |
| 150 | 1320 | uint32x4_t res12 = scale_shift(vqtbl1q_u8(src, w1)); | |
| 151 | 1320 | uint32x4_t res21 = scale_shift(vqtbl1q_u8(src, w2)); | |
| 152 | 1320 | uint32x4_t res22 = scale_shift(vqtbl1q_u8(src, w3)); | |
| 153 | // Convert back from 32-bit: top two bytes are 0 for sure, unzip them | ||
| 154 | 2640 | uint16x8_t res1 = | |
| 155 | 1320 | vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12)); | |
| 156 | 2640 | uint16x8_t res2 = | |
| 157 | 1320 | vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22)); | |
| 158 | |||
| 159 | // Saturating narrowing from 16 to 8 bits | ||
| 160 | 2640 | return vqmovn_high_u16(vqmovn_u16(res1), res2); | |
| 161 | 1320 | } | |
| 162 | |||
| 163 | 2570 | ScalarType scalar_path(ScalarType src) { | |
| 164 | 2570 | return scale_value(src, scale_, shift_); | |
| 165 | } | ||
| 166 | |||
| 167 | private: | ||
| 168 | static constexpr ScalarType FF = std::numeric_limits<uint8_t>::max(); | ||
| 169 | // clang-format off | ||
| 170 | static constexpr uint8x16_t w0 = { 0, FF, FF, FF, 1, FF, FF, FF, 2, FF, FF, FF, 3, FF, FF, FF}; | ||
| 171 | static constexpr uint8x16_t w1 = { 4, FF, FF, FF, 5, FF, FF, FF, 6, FF, FF, FF, 7, FF, FF, FF}; | ||
| 172 | static constexpr uint8x16_t w2 = { 8, FF, FF, FF, 9, FF, FF, FF, 10, FF, FF, FF, 11, FF, FF, FF}; | ||
| 173 | static constexpr uint8x16_t w3 = {12, FF, FF, FF, 13, FF, FF, FF, 14, FF, FF, FF, 15, FF, FF, FF}; | ||
| 174 | // clang-format on | ||
| 175 | |||
| 176 | // Convert from uint32 to float32, scale and convert back with rounding | ||
| 177 | 5280 | inline uint32x4_t scale_shift(VectorType src) { | |
| 178 | 5280 | float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src)); | |
| 179 | // scale + shift is done by MLA | ||
| 180 | 10560 | return vcvtnq_u32_f32(vmlaq_f32(vshift_, fx, vscale_)); | |
| 181 | 5280 | } | |
| 182 | |||
| 183 | float32x4_t vscale_, vshift_; | ||
| 184 | }; // end of class ScaleUint8Calc<T> | ||
| 185 | |||
| 186 | 168 | kleidicv_error_t scale_with_precalculated_table_u8( | |
| 187 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 188 | size_t width, size_t height, double scale, double shift, | ||
| 189 | const std::array<uint8_t, 256> &precalculated_table) { | ||
| 190 | 168 | Rectangle rect{width, height}; | |
| 191 | 168 | Rows<const uint8_t> src_rows{src, src_stride}; | |
| 192 | 168 | Rows<uint8_t> dst_rows{dst, dst_stride}; | |
| 193 | 336 | ScaleUint8Tbx operation(static_cast<float>(scale), static_cast<float>(shift), | |
| 194 | 168 | precalculated_table.data()); | |
| 195 | 168 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 196 | |||
| 197 | 168 | return KLEIDICV_OK; | |
| 198 | 168 | } | |
| 199 | |||
| 200 | // Specialization for uint8_t to uint8_t | ||
| 201 | template <> | ||
| 202 | 200 | kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst, | |
| 203 | size_t dst_stride, size_t width, size_t height, | ||
| 204 | double scale, double shift) { | ||
| 205 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 196 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 196 times.
|
200 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 206 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 192 times.
|
196 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 207 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 188 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 184 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 184 times.
|
192 | CHECK_IMAGE_SIZE(width, height); |
| 208 | // For smaller inputs, the full calculation is the faster | ||
| 209 |
2/2✓ Branch 0 taken 172 times.
✓ Branch 1 taken 12 times.
|
184 | if (width * height < 675) { // empirical value |
| 210 | 172 | Rectangle rect{width, height}; | |
| 211 | 172 | Rows<const uint8_t> src_rows{src, src_stride}; | |
| 212 | 172 | Rows<uint8_t> dst_rows{dst, dst_stride}; | |
| 213 | 344 | ScaleUint8Calc operation(static_cast<float>(scale), | |
| 214 | 172 | static_cast<float>(shift)); | |
| 215 | 172 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 216 | 172 | } else { | |
| 217 | // For bigger inputs, it's faster to pre-calculate the table | ||
| 218 | // and map those values during the run | ||
| 219 | 12 | auto precalculated_table = precalculate_scale_table_u8(scale, shift); | |
| 220 | 24 | return scale_with_precalculated_table_u8(src, src_stride, dst, dst_stride, | |
| 221 | 12 | width, height, scale, shift, | |
| 222 | precalculated_table); | ||
| 223 | 12 | } | |
| 224 | 172 | return KLEIDICV_OK; | |
| 225 | 200 | } | |
| 226 | |||
| 227 | 7936 | static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) { | |
| 228 | 7936 | float32x4_t fx = vcvtq_f32_u32(src); | |
| 229 | 7936 | float32x4_t max = vdupq_n_f32(255.0F); | |
| 230 | 7936 | float32x4_t min = vdupq_n_f32(0.0F); | |
| 231 | 7936 | float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale)); | |
| 232 | 15872 | return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max))); | |
| 233 | 7936 | } | |
| 234 | |||
| 235 | 124 | std::array<uint8_t, 256> precalculate_scale_table_u8(double dscale, | |
| 236 | double dshift) { | ||
| 237 | 124 | float scale = static_cast<float>(dscale); | |
| 238 | 124 | float shift = static_cast<float>(dshift); | |
| 239 | static constexpr size_t TableLength = 256; | ||
| 240 | 124 | std::array<uint8_t, TableLength> precalculated_table{}; | |
| 241 | |||
| 242 | 124 | uint32x4_t counter = {0, 1, 2, 3}; | |
| 243 | 124 | uint32x4_t four = vdupq_n_u32(4); | |
| 244 | |||
| 245 |
2/2✓ Branch 0 taken 124 times.
✓ Branch 1 taken 1984 times.
|
2108 | for (size_t i = 0; i < TableLength; i += 16) { |
| 246 | 1984 | uint32x4_t res11 = scale_shift(counter, scale, shift); | |
| 247 | 1984 | counter = vaddq(counter, four); | |
| 248 | 1984 | uint32x4_t res12 = scale_shift(counter, scale, shift); | |
| 249 | 1984 | counter = vaddq(counter, four); | |
| 250 | 1984 | uint32x4_t res21 = scale_shift(counter, scale, shift); | |
| 251 | 1984 | counter = vaddq(counter, four); | |
| 252 | 1984 | uint32x4_t res22 = scale_shift(counter, scale, shift); | |
| 253 | 1984 | counter = vaddq(counter, four); | |
| 254 | |||
| 255 | 3968 | uint16x8_t res1 = | |
| 256 | 1984 | vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12)); | |
| 257 | 3968 | uint16x8_t res2 = | |
| 258 | 1984 | vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22)); | |
| 259 | // Saturating narrowing from 16 to 8 bits | ||
| 260 | 1984 | uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2); | |
| 261 | |||
| 262 | 1984 | vst1q_u8(&precalculated_table[i], res); | |
| 263 | 1984 | } | |
| 264 | return precalculated_table; | ||
| 265 | 124 | } | |
| 266 | |||
| 267 | // ----------------------------------------------------------------------- | ||
| 268 | // Float implementation | ||
| 269 | // ----------------------------------------------------------------------- | ||
| 270 | |||
| 271 | class AddFloat final : public UnrollTwice, public UnrollOnce { | ||
| 272 | public: | ||
| 273 | using ScalarType = float; | ||
| 274 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 275 | using VectorType = typename VecTraits::VectorType; | ||
| 276 | |||
| 277 | 6 | explicit AddFloat(float shift) : shift_{shift}, vshift_{vdupq_n_f32(shift)} {} | |
| 278 | |||
| 279 | 5031 | VectorType vector_path(VectorType src) { return vaddq_f32(vshift_, src); } | |
| 280 | |||
| 281 | // NOLINTBEGIN(readability-make-member-function-const) | ||
| 282 | 13 | ScalarType scalar_path(ScalarType src) { return src + shift_; } | |
| 283 | // NOLINTEND(readability-make-member-function-const) | ||
| 284 | |||
| 285 | private: | ||
| 286 | float shift_; | ||
| 287 | float32x4_t vshift_; | ||
| 288 | }; // end of class AddFloat | ||
| 289 | |||
| 290 | class ScaleFloat final : public UnrollTwice, public UnrollOnce { | ||
| 291 | public: | ||
| 292 | using ScalarType = float; | ||
| 293 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 294 | using VectorType = typename VecTraits::VectorType; | ||
| 295 | |||
| 296 | 85 | ScaleFloat(float scale, float shift) | |
| 297 | 85 | : scale_{scale}, | |
| 298 | 85 | shift_{shift}, | |
| 299 | 85 | vscale_{vdupq_n_f32(scale)}, | |
| 300 | 85 | vshift_{vdupq_n_f32(shift)} {} | |
| 301 | |||
| 302 | 6854 | VectorType vector_path(VectorType src) { | |
| 303 | 6854 | return vmlaq_f32(vshift_, src, vscale_); | |
| 304 | } | ||
| 305 | |||
| 306 | // NOLINTBEGIN(readability-make-member-function-const) | ||
| 307 | 76 | ScalarType scalar_path(ScalarType src) { return src * scale_ + shift_; } | |
| 308 | // NOLINTEND(readability-make-member-function-const) | ||
| 309 | |||
| 310 | private: | ||
| 311 | float scale_, shift_; | ||
| 312 | float32x4_t vscale_, vshift_; | ||
| 313 | }; // end of class ScaleFloat | ||
| 314 | |||
| 315 | // Specialization for float to float | ||
| 316 | template <> | ||
| 317 | 97 | kleidicv_error_t scale(const float *src, size_t src_stride, float *dst, | |
| 318 | size_t dst_stride, size_t width, size_t height, | ||
| 319 | double scale, double shift) { | ||
| 320 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 95 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 95 times.
|
97 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 321 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 93 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 93 times.
|
95 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 322 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 92 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 91 times.
|
93 | CHECK_IMAGE_SIZE(width, height); |
| 323 | |||
| 324 | 91 | Rectangle rect{width, height}; | |
| 325 | 91 | Rows<const float> src_rows{src, src_stride}; | |
| 326 | 91 | Rows<float> dst_rows{dst, dst_stride}; | |
| 327 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 85 times.
|
91 | if (scale == 1.0) { |
| 328 | 6 | AddFloat operation(static_cast<float>(shift)); | |
| 329 | 6 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 330 | 6 | } else { | |
| 331 | 85 | ScaleFloat operation(static_cast<float>(scale), static_cast<float>(shift)); | |
| 332 | 85 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 333 | 85 | } | |
| 334 | 91 | return KLEIDICV_OK; | |
| 335 | 97 | } | |
| 336 | |||
| 337 | // ----------------------------------------------------------------------- | ||
| 338 | // Scale uint8 to float16 | ||
| 339 | // ----------------------------------------------------------------------- | ||
| 340 | |||
| 341 | class ScaleUint8ToFloat16 { | ||
| 342 | public: | ||
| 343 | using SrcType = uint8_t; | ||
| 344 | using SrcVecTraits = neon::VecTraits<SrcType>; | ||
| 345 | using SrcVectorType = typename SrcVecTraits::VectorType; | ||
| 346 | using SrcVector2Type = typename SrcVecTraits::Vector2Type; | ||
| 347 | using DstType = float16_t; | ||
| 348 | using DstVecTraits = neon::VecTraits<DstType>; | ||
| 349 | using DstVectorType = typename DstVecTraits::VectorType; | ||
| 350 | using DstVector2Type = typename DstVecTraits::Vector2Type; | ||
| 351 | using DstVector4Type = typename DstVecTraits::Vector4Type; | ||
| 352 | |||
| 353 | 96 | ScaleUint8ToFloat16(float scale, float shift) | |
| 354 | 96 | : scale_{scale}, | |
| 355 | 96 | shift_{shift}, | |
| 356 | 96 | vscale_{vdupq_n_f32(scale)}, | |
| 357 | 96 | vshift_{vdupq_n_f32(shift)} {} | |
| 358 | |||
| 359 | 191 | void process_row(size_t width, Columns<const SrcType> src, | |
| 360 | Columns<DstType> dst) { | ||
| 361 | 382 | LoopUnroll{width, SrcVecTraits::num_lanes()} | |
| 362 | 671 | .unroll_twice([&](size_t step) { | |
| 363 | 480 | SrcVector2Type src_2vec; | |
| 364 | 480 | SrcVecTraits::load(&src[0], src_2vec); | |
| 365 | 480 | DstVector2Type dst_2vec1 = vector_path(src_2vec.val[0]); | |
| 366 | 480 | DstVector2Type dst_2vec2 = vector_path(src_2vec.val[1]); | |
| 367 | 480 | DstVector4Type dst_4vec = { | |
| 368 | 1920 | dst_2vec1.val[0], | |
| 369 | 480 | dst_2vec1.val[1], | |
| 370 | 480 | dst_2vec2.val[0], | |
| 371 | 480 | dst_2vec2.val[1], | |
| 372 | }; | ||
| 373 | 480 | DstVecTraits::store(dst_4vec, &dst[0]); | |
| 374 | 480 | src += ptrdiff_t(step); | |
| 375 | 480 | dst += ptrdiff_t(step); | |
| 376 | 480 | }) | |
| 377 | 317 | .remaining([&](size_t length, size_t) { | |
| 378 |
2/2✓ Branch 0 taken 126 times.
✓ Branch 1 taken 1807 times.
|
1933 | for (ptrdiff_t index = 0; index < static_cast<ptrdiff_t>(length); |
| 379 | 1807 | ++index) { | |
| 380 | 1807 | disable_loop_vectorization(); | |
| 381 | 1807 | dst[index] = static_cast<float16_t>( | |
| 382 | 1807 | static_cast<float>(src[index]) * scale_ + shift_); | |
| 383 | 1807 | } | |
| 384 | 126 | }); | |
| 385 | 191 | } | |
| 386 | |||
| 387 | private: | ||
| 388 | 960 | DstVector2Type vector_path(SrcVectorType src) { | |
| 389 | // For scaling, uint8 values have to be converted to uint32 | ||
| 390 | // i.e. create four vectors from one | ||
| 391 | 960 | float32x4_t res0 = scale_shift(vqtbl1q_u8(src, kW0)); | |
| 392 | 960 | float32x4_t res1 = scale_shift(vqtbl1q_u8(src, kW1)); | |
| 393 | 960 | float32x4_t res2 = scale_shift(vqtbl1q_u8(src, kW2)); | |
| 394 | 960 | float32x4_t res3 = scale_shift(vqtbl1q_u8(src, kW3)); | |
| 395 | // Convert from 32-bit to 16-bit | ||
| 396 | 960 | float16x4_t res16_0 = vcvt_f16_f32(res0); | |
| 397 | 960 | float16x4_t res16_2 = vcvt_f16_f32(res2); | |
| 398 | DstVector2Type res; | ||
| 399 | 960 | res.val[0] = vcvt_high_f16_f32(res16_0, res1); | |
| 400 | 960 | res.val[1] = vcvt_high_f16_f32(res16_2, res3); | |
| 401 | return res; | ||
| 402 | 960 | } | |
| 403 | |||
| 404 | // Convert from uint32 to float32 and scale it | ||
| 405 | 3840 | inline float32x4_t scale_shift(SrcVectorType src) { | |
| 406 | 3840 | float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src)); | |
| 407 | 7680 | return vmlaq_f32(vshift_, fx, vscale_); | |
| 408 | 3840 | } | |
| 409 | |||
| 410 | static constexpr SrcType kFF = std::numeric_limits<SrcType>::max(); | ||
| 411 | // clang-format off | ||
| 412 | static constexpr uint8x16_t kW0 = { 0, kFF, kFF, kFF, 1, kFF, kFF, kFF, 2, kFF, kFF, kFF, 3, kFF, kFF, kFF}; | ||
| 413 | static constexpr uint8x16_t kW1 = { 4, kFF, kFF, kFF, 5, kFF, kFF, kFF, 6, kFF, kFF, kFF, 7, kFF, kFF, kFF}; | ||
| 414 | static constexpr uint8x16_t kW2 = { 8, kFF, kFF, kFF, 9, kFF, kFF, kFF, 10, kFF, kFF, kFF, 11, kFF, kFF, kFF}; | ||
| 415 | static constexpr uint8x16_t kW3 = {12, kFF, kFF, kFF, 13, kFF, kFF, kFF, 14, kFF, kFF, kFF, 15, kFF, kFF, kFF}; | ||
| 416 | // clang-format on | ||
| 417 | |||
| 418 | float scale_, shift_; | ||
| 419 | float32x4_t vscale_, vshift_; | ||
| 420 | }; // end of class ScaleUint8ToFloat16 | ||
| 421 | |||
| 422 | // Specialization for uint8_t to float16_t | ||
| 423 | template <> | ||
| 424 | 100 | kleidicv_error_t scale(const uint8_t *src, size_t src_stride, float16_t *dst, | |
| 425 | size_t dst_stride, size_t width, size_t height, | ||
| 426 | double scale, double shift) { | ||
| 427 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 99 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 99 times.
|
100 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 428 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 98 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 98 times.
|
99 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 429 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 97 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 96 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 96 times.
|
98 | CHECK_IMAGE_SIZE(width, height); |
| 430 | |||
| 431 | 96 | Rectangle rect{width, height}; | |
| 432 | 96 | Rows<const uint8_t> src_rows{src, src_stride}; | |
| 433 | 96 | Rows<float16_t> dst_rows{dst, dst_stride}; | |
| 434 | 192 | ScaleUint8ToFloat16 operation(static_cast<float>(scale), | |
| 435 | 96 | static_cast<float>(shift)); | |
| 436 | 96 | zip_rows(operation, rect, src_rows, dst_rows); | |
| 437 | 96 | return KLEIDICV_OK; | |
| 438 | 100 | } | |
| 439 | |||
| 440 | } // namespace kleidicv::neon | ||
| 441 |