KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/arithmetics/scale_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 203 203 100.0%
Functions: 27 27 100.0%
Branches: 52 52 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <climits>
6 #include <cmath>
7 #include <cstdint>
8 #include <limits>
9
10 #include "kleidicv/arithmetics/scale.h"
11 #include "kleidicv/neon.h"
12 #include "kleidicv/traits.h"
13
14 namespace kleidicv::neon {
15
16 // Scale algorithm: for each value in the source,
17 // dst[i] = src[i] * scale + shift (floating point operation)
18 //
19 // Unsigned 8-bit implementation
20 //
21 // Since converting from uint8 to float32 and back takes more steps,
22 // 'ScaleTbx' saves time by pre-calculating all 256 values and uses TBLs
23 // and TBXs to map the values directly from uint8 to uint8:
24 // i: 0 to 255: tbl[i] = i * scale + shift
25 //
26 // Since a single TBL instruction can map only 16 values, more TBX instructions
27 // are needed for the remaining 240 values. After the first TBL (that replaces
28 // 0-15 values with indexed values from the table) 16 is subtracted from all
29 // lanes in the source vector before the next TBX is done, so when indexing 0
30 // to 15, actually 16 to 31 values are replaced from the original source vector.
31 //
32 // Example:
33 // scale = 1
34 // shift = 100
35 // Initialization: (it also takes time, so for short inputs it's not used)
36 // tbl = [ 100, 101, 102, ..., 255, <100 times 255, it's saturated>]
37 // Copy table to vector registers:
38 // t0 = [ 100, ..., 115 ]
39 // t1 = [ 116, ..., 131 ]
40 // t2 = [ 132, ..., 147 ]
41 // ...
42 // t15 = [ 255, ..., 255 ]
43 //
44 // input: v = [ 21, 3, 39, 6 ]
45 // TBL(t0): d = [ 0, 103, 0, 106 ] // indices >= 16 result in 0
46 // SUB: v = [ 5, 243, 23, 246 ] // subtracted 16 --> next table
47 // TBX(t1): d = [ 121, 103, 0, 106 ] // indices >= 16 are ignored
48 // SUB: v = [ 245, 227, 7, 230 ] // subtracted 16 --> next table
49 // TBX(t2): d = [ 121, 103, 139, 106 ] // indices >= 16 are ignored
50 // ... etc.
51 //
52 // Bigger index tables (32, 48 or 64 values) can be used by TBX2 - TBX3 - TBX4.
53 // In this case, instead of 16, 2/3/4 * 16 have to be subtracted from source.
54 // The below solution (combining TBX2-TBX3) gives a good compromise between code
55 // size and speed.
56
57 template <typename ScalarType>
58 class ScaleIntBase : public UnrollTwice {
59 public:
60 424 ScaleIntBase(float scale, float shift) : scale_{scale}, shift_{shift} {}
61
62 protected:
63 static constexpr ScalarType ScalarMax =
64 std::numeric_limits<ScalarType>::max();
65
66 float scale_, shift_;
67 };
68
69 template <typename T, typename U>
70 kleidicv_error_t scale(const T *src, size_t src_stride, U *dst,
71 size_t dst_stride, size_t width, size_t height,
72 double scale, double shift);
73
74 template <typename T>
75 2570 T scale_value(T value, double scale, double shift) {
76 static constexpr T ScalarMax = std::numeric_limits<T>::max();
77 2570 int64_t v = lrintf(static_cast<float>(value) * scale + shift);
78
2/2
✓ Branch 0 taken 2166 times.
✓ Branch 1 taken 404 times.
2570 if (static_cast<uint64_t>(v) <= ScalarMax) {
79 2166 return static_cast<T>(v);
80 }
81 404 return static_cast<T>(v > 0 ? ScalarMax : 0);
82 2570 }
83
84 class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
85 public:
86 using ScalarType = uint8_t;
87 using VecTraits = neon::VecTraits<ScalarType>;
88 using VectorType = typename VecTraits::VectorType;
89 using Vector2Type = typename VecTraits::Vector2Type;
90 using Vector3Type = typename VecTraits::Vector3Type;
91
92 252 ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table)
93 252 : ScaleIntBase<uint8_t>(scale, shift),
94 252 table_pointer_(precalculated_table),
95 252 v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())),
96 252 v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) {
97 252 VecTraits::load(precalculated_table, t0_3_);
98 252 VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_);
99 504 VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(),
100 252 t2_2_);
101 504 VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(),
102 252 t3_3_);
103 252 VecTraits::load(
104 252 precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_);
105 252 VecTraits::load(
106 252 precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(),
107 252 t5_3_);
108 252 }
109 1888 VectorType vector_path(VectorType src) {
110 1888 VectorType dst = vqtbl3q_u8(t0_3_, src);
111 1888 src = vsubq_u8(src, v_step3_);
112 1888 dst = vqtbx3q_u8(dst, t1_3_, src);
113 1888 src = vsubq_u8(src, v_step3_);
114 1888 dst = vqtbx2q_u8(dst, t2_2_, src);
115 1888 src = vsubq_u8(src, v_step2_);
116 1888 dst = vqtbx3q_u8(dst, t3_3_, src);
117 1888 src = vsubq_u8(src, v_step3_);
118 1888 dst = vqtbx2q_u8(dst, t4_2_, src);
119 1888 src = vsubq_u8(src, v_step2_);
120 1888 dst = vqtbx3q_u8(dst, t5_3_, src);
121 3776 return dst;
122 1888 }
123
124 2990 ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; }
125
126 private:
127 const ScalarType *table_pointer_;
128 252 Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{};
129 252 Vector2Type t2_2_{}, t4_2_{};
130 VectorType v_step3_, v_step2_;
131 }; // end of class ScaleUint8Tbx<T>
132
133 // In contrast to ScaleUint8Tbx, ScaleUint8Calc is the direct approach:
134 // - calculate dst[i] = src[i] * scale + shift using vector instructions
135 class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
136 public:
137 using ScalarType = uint8_t;
138 using VecTraits = neon::VecTraits<ScalarType>;
139 using VectorType = typename VecTraits::VectorType;
140
141 172 ScaleUint8Calc(float scale, float shift)
142 172 : ScaleIntBase<ScalarType>(scale, shift),
143 172 vscale_{vdupq_n_f32(scale)},
144 172 vshift_{vdupq_n_f32(shift)} {}
145
146 1320 VectorType vector_path(VectorType src) {
147 // For scaling, uint8 values have to be converted to uint32
148 // i.e. create four vectors from one
149 1320 uint32x4_t res11 = scale_shift(vqtbl1q_u8(src, w0));
150 1320 uint32x4_t res12 = scale_shift(vqtbl1q_u8(src, w1));
151 1320 uint32x4_t res21 = scale_shift(vqtbl1q_u8(src, w2));
152 1320 uint32x4_t res22 = scale_shift(vqtbl1q_u8(src, w3));
153 // Convert back from 32-bit: top two bytes are 0 for sure, unzip them
154 2640 uint16x8_t res1 =
155 1320 vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
156 2640 uint16x8_t res2 =
157 1320 vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
158
159 // Saturating narrowing from 16 to 8 bits
160 2640 return vqmovn_high_u16(vqmovn_u16(res1), res2);
161 1320 }
162
163 2570 ScalarType scalar_path(ScalarType src) {
164 2570 return scale_value(src, scale_, shift_);
165 }
166
167 private:
168 static constexpr ScalarType FF = std::numeric_limits<uint8_t>::max();
169 // clang-format off
170 static constexpr uint8x16_t w0 = { 0, FF, FF, FF, 1, FF, FF, FF, 2, FF, FF, FF, 3, FF, FF, FF};
171 static constexpr uint8x16_t w1 = { 4, FF, FF, FF, 5, FF, FF, FF, 6, FF, FF, FF, 7, FF, FF, FF};
172 static constexpr uint8x16_t w2 = { 8, FF, FF, FF, 9, FF, FF, FF, 10, FF, FF, FF, 11, FF, FF, FF};
173 static constexpr uint8x16_t w3 = {12, FF, FF, FF, 13, FF, FF, FF, 14, FF, FF, FF, 15, FF, FF, FF};
174 // clang-format on
175
176 // Convert from uint32 to float32, scale and convert back with rounding
177 5280 inline uint32x4_t scale_shift(VectorType src) {
178 5280 float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src));
179 // scale + shift is done by MLA
180 10560 return vcvtnq_u32_f32(vmlaq_f32(vshift_, fx, vscale_));
181 5280 }
182
183 float32x4_t vscale_, vshift_;
184 }; // end of class ScaleUint8Calc<T>
185
186 252 kleidicv_error_t scale_with_precalculated_table_u8(
187 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
188 size_t width, size_t height, double scale, double shift,
189 const std::array<uint8_t, 256> &precalculated_table) {
190 252 Rectangle rect{width, height};
191 252 Rows<const uint8_t> src_rows{src, src_stride};
192 252 Rows<uint8_t> dst_rows{dst, dst_stride};
193 504 ScaleUint8Tbx operation(static_cast<float>(scale), static_cast<float>(shift),
194 252 precalculated_table.data());
195 252 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
196
197 252 return KLEIDICV_OK;
198 252 }
199
200 // Specialization for uint8_t to uint8_t
201 template <>
202 200 kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst,
203 size_t dst_stride, size_t width, size_t height,
204 double scale, double shift) {
205
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 196 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 196 times.
200 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
206
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 192 times.
196 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
207
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 188 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 184 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 184 times.
192 CHECK_IMAGE_SIZE(width, height);
208 // For smaller inputs, the full calculation is faster
209
2/2
✓ Branch 0 taken 172 times.
✓ Branch 1 taken 12 times.
184 if (width * height < 675) { // empirical value
210 172 Rectangle rect{width, height};
211 172 Rows<const uint8_t> src_rows{src, src_stride};
212 172 Rows<uint8_t> dst_rows{dst, dst_stride};
213 344 ScaleUint8Calc operation(static_cast<float>(scale),
214 172 static_cast<float>(shift));
215 172 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
216 172 } else {
217 // For bigger inputs, it's faster to pre-calculate the table
218 // and map those values during the run
219 12 auto precalculated_table = precalculate_scale_table_u8(scale, shift);
220 24 return scale_with_precalculated_table_u8(src, src_stride, dst, dst_stride,
221 12 width, height, scale, shift,
222 precalculated_table);
223 12 }
224 172 return KLEIDICV_OK;
225 200 }
226
227 7936 static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) {
228 7936 float32x4_t fx = vcvtq_f32_u32(src);
229 7936 float32x4_t max = vdupq_n_f32(255.0F);
230 7936 float32x4_t min = vdupq_n_f32(0.0F);
231 7936 float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale));
232 15872 return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max)));
233 7936 }
234
235 124 std::array<uint8_t, 256> precalculate_scale_table_u8(double dscale,
236 double dshift) {
237 124 float scale = static_cast<float>(dscale);
238 124 float shift = static_cast<float>(dshift);
239 static constexpr size_t TableLength = 256;
240 124 std::array<uint8_t, TableLength> precalculated_table{};
241
242 124 uint32x4_t counter = {0, 1, 2, 3};
243 124 uint32x4_t four = vdupq_n_u32(4);
244
245
2/2
✓ Branch 0 taken 124 times.
✓ Branch 1 taken 1984 times.
2108 for (size_t i = 0; i < TableLength; i += 16) {
246 1984 uint32x4_t res11 = scale_shift(counter, scale, shift);
247 1984 counter = vaddq(counter, four);
248 1984 uint32x4_t res12 = scale_shift(counter, scale, shift);
249 1984 counter = vaddq(counter, four);
250 1984 uint32x4_t res21 = scale_shift(counter, scale, shift);
251 1984 counter = vaddq(counter, four);
252 1984 uint32x4_t res22 = scale_shift(counter, scale, shift);
253 1984 counter = vaddq(counter, four);
254
255 3968 uint16x8_t res1 =
256 1984 vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
257 3968 uint16x8_t res2 =
258 1984 vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
259 // Saturating narrowing from 16 to 8 bits
260 1984 uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2);
261
262 1984 vst1q_u8(&precalculated_table[i], res);
263 1984 }
264 return precalculated_table;
265 124 }
266
267 // -----------------------------------------------------------------------
268 // Float implementation
269 // -----------------------------------------------------------------------
270
271 class AddFloat final : public UnrollTwice, public UnrollOnce {
272 public:
273 using ScalarType = float;
274 using VecTraits = neon::VecTraits<ScalarType>;
275 using VectorType = typename VecTraits::VectorType;
276
277 6 explicit AddFloat(float shift) : shift_{shift}, vshift_{vdupq_n_f32(shift)} {}
278
279 5031 VectorType vector_path(VectorType src) { return vaddq_f32(vshift_, src); }
280
281 13 ScalarType scalar_path(ScalarType src) const { return src + shift_; }
282
283 private:
284 float shift_;
285 float32x4_t vshift_;
286 }; // end of class AddFloat
287
288 class ScaleFloat final : public UnrollTwice, public UnrollOnce {
289 public:
290 using ScalarType = float;
291 using VecTraits = neon::VecTraits<ScalarType>;
292 using VectorType = typename VecTraits::VectorType;
293
294 155 ScaleFloat(float scale, float shift)
295 155 : scale_{scale},
296 155 shift_{shift},
297 155 vscale_{vdupq_n_f32(scale)},
298 155 vshift_{vdupq_n_f32(shift)} {}
299
300 6839 VectorType vector_path(VectorType src) const {
301 6839 return vfmaq_f32(vshift_, src, vscale_);
302 }
303
304 136 ScalarType scalar_path(ScalarType src) const {
305 136 return std::fma(src, scale_, shift_);
306 }
307
308 private:
309 float scale_, shift_;
310 float32x4_t vscale_, vshift_;
311 }; // end of class ScaleFloat
312
313 // Specialization for float to float
314 template <>
315 167 kleidicv_error_t scale(const float *src, size_t src_stride, float *dst,
316 size_t dst_stride, size_t width, size_t height,
317 double scale, double shift) {
318
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 165 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 165 times.
167 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
319
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 163 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 163 times.
165 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
320
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 162 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 161 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 161 times.
163 CHECK_IMAGE_SIZE(width, height);
321
322 161 Rectangle rect{width, height};
323 161 Rows<const float> src_rows{src, src_stride};
324 161 Rows<float> dst_rows{dst, dst_stride};
325
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 155 times.
161 if (scale == 1.0) {
326 6 AddFloat operation(static_cast<float>(shift));
327 6 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
328 6 } else {
329 155 ScaleFloat operation(static_cast<float>(scale), static_cast<float>(shift));
330 155 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
331 155 }
332 161 return KLEIDICV_OK;
333 167 }
334
335 // -----------------------------------------------------------------------
336 // Scale uint8 to float16
337 // -----------------------------------------------------------------------
338
339 class ScaleUint8ToFloat16 {
340 public:
341 using SrcType = uint8_t;
342 using SrcVecTraits = neon::VecTraits<SrcType>;
343 using SrcVectorType = typename SrcVecTraits::VectorType;
344 using SrcVector2Type = typename SrcVecTraits::Vector2Type;
345 using DstType = float16_t;
346 using DstVecTraits = neon::VecTraits<DstType>;
347 using DstVectorType = typename DstVecTraits::VectorType;
348 using DstVector2Type = typename DstVecTraits::Vector2Type;
349 using DstVector4Type = typename DstVecTraits::Vector4Type;
350
351 114 ScaleUint8ToFloat16(float scale, float shift)
352 114 : scale_{scale},
353 114 shift_{shift},
354 114 vscale_{vdupq_n_f32(scale)},
355 114 vshift_{vdupq_n_f32(shift)} {}
356
357 209 void process_row(size_t width, Columns<const SrcType> src,
358 Columns<DstType> dst) {
359 418 LoopUnroll{width, SrcVecTraits::num_lanes()}
360 677 .unroll_twice([&](size_t step) {
361 468 SrcVector2Type src_2vec;
362 468 SrcVecTraits::load(&src[0], src_2vec);
363 468 DstVector2Type dst_2vec1 = vector_path(src_2vec.val[0]);
364 468 DstVector2Type dst_2vec2 = vector_path(src_2vec.val[1]);
365 468 DstVector4Type dst_4vec = {
366 1872 dst_2vec1.val[0],
367 468 dst_2vec1.val[1],
368 468 dst_2vec2.val[0],
369 468 dst_2vec2.val[1],
370 };
371 468 DstVecTraits::store(dst_4vec, &dst[0]);
372 468 src += ptrdiff_t(step);
373 468 dst += ptrdiff_t(step);
374 468 })
375 361 .remaining([&](size_t length, size_t) {
376
2/2
✓ Branch 0 taken 152 times.
✓ Branch 1 taken 2191 times.
2343 for (ptrdiff_t index = 0; index < static_cast<ptrdiff_t>(length);
377 2191 ++index) {
378 2191 disable_loop_vectorization();
379 2191 dst[index] = static_cast<float16_t>(
380 2191 static_cast<float>(src[index]) * scale_ + shift_);
381 2191 }
382 152 });
383 209 }
384
385 private:
386 936 DstVector2Type vector_path(SrcVectorType src) {
387 // For scaling, uint8 values have to be converted to uint32
388 // i.e. create four vectors from one
389 936 float32x4_t res0 = scale_shift(vqtbl1q_u8(src, kW0));
390 936 float32x4_t res1 = scale_shift(vqtbl1q_u8(src, kW1));
391 936 float32x4_t res2 = scale_shift(vqtbl1q_u8(src, kW2));
392 936 float32x4_t res3 = scale_shift(vqtbl1q_u8(src, kW3));
393 // Convert from 32-bit to 16-bit
394 936 float16x4_t res16_0 = vcvt_f16_f32(res0);
395 936 float16x4_t res16_2 = vcvt_f16_f32(res2);
396 DstVector2Type res;
397 936 res.val[0] = vcvt_high_f16_f32(res16_0, res1);
398 936 res.val[1] = vcvt_high_f16_f32(res16_2, res3);
399 return res;
400 936 }
401
402 // Convert from uint32 to float32 and scale it
403 3744 inline float32x4_t scale_shift(SrcVectorType src) {
404 3744 float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src));
405 7488 return vmlaq_f32(vshift_, fx, vscale_);
406 3744 }
407
408 static constexpr SrcType kFF = std::numeric_limits<SrcType>::max();
409 // clang-format off
410 static constexpr uint8x16_t kW0 = { 0, kFF, kFF, kFF, 1, kFF, kFF, kFF, 2, kFF, kFF, kFF, 3, kFF, kFF, kFF};
411 static constexpr uint8x16_t kW1 = { 4, kFF, kFF, kFF, 5, kFF, kFF, kFF, 6, kFF, kFF, kFF, 7, kFF, kFF, kFF};
412 static constexpr uint8x16_t kW2 = { 8, kFF, kFF, kFF, 9, kFF, kFF, kFF, 10, kFF, kFF, kFF, 11, kFF, kFF, kFF};
413 static constexpr uint8x16_t kW3 = {12, kFF, kFF, kFF, 13, kFF, kFF, kFF, 14, kFF, kFF, kFF, 15, kFF, kFF, kFF};
414 // clang-format on
415
416 float scale_, shift_;
417 float32x4_t vscale_, vshift_;
418 }; // end of class ScaleUint8ToFloat16
419
420 // Specialization for uint8_t to float16_t
421 template <>
422 118 kleidicv_error_t scale(const uint8_t *src, size_t src_stride, float16_t *dst,
423 size_t dst_stride, size_t width, size_t height,
424 double scale, double shift) {
425
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 117 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 117 times.
118 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
426
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 116 times.
117 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
427
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 115 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 114 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 114 times.
116 CHECK_IMAGE_SIZE(width, height);
428
429 114 Rectangle rect{width, height};
430 114 Rows<const uint8_t> src_rows{src, src_stride};
431 114 Rows<float16_t> dst_rows{dst, dst_stride};
432 228 ScaleUint8ToFloat16 operation(static_cast<float>(scale),
433 114 static_cast<float>(shift));
434 114 zip_rows(operation, rect, src_rows, dst_rows);
435 114 return KLEIDICV_OK;
436 118 }
437
438 } // namespace kleidicv::neon
439