KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/arithmetics/scale_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	142	142	100.0%
Functions:	20	20	100.0%
Branches:	36	36	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <climits>
    
      #include <cmath>
    
      #include <cstdint>
    
      #include <limits>
    
      #include "kleidicv/arithmetics/scale.h"
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/traits.h"
    
      namespace kleidicv::neon {
    
      // Scale algorithm: for each value in the source,
    
      //   dst[i] = src[i] * scale + shift   (floating point operation)
    
      //
    
      // Unsigned 8-bit implementation
    
      //
    
      // Since converting from uint8 to float32 and back takes more steps,
    
      // 'ScaleTbx' saves time by pre-calculating all 256 values and uses TBLs
    
      // and TBXs to map the values directly from uint8 to uint8:
    
      //   i: 0 to 255:    tbl[i] = i * scale + shift
    
      //
    
      // Since a single TBL intruction can map only 16 values, more TBX instructions
    
      // needed for the remaining 240 values. After the first TBL (that replaces
    
      // 0-15 values with indexed values from the table) 16 is subtracted from all
    
      // lanes in the source vector before the next TBX is done, so when indexing 0
    
      // to 15, actually 16 to 31 values are replaced from the original source vector.
    
      //
    
      // Example:
    
      //   scale = 1
    
      //   shift = 100
    
      // Initialization: (it also takes time, so for short inputs it's not used)
    
      //   tbl = [ 100, 101, 102, ..., 255, <100 times 255, it's saturated>]
    
      // Copy table to vector registers:
    
      //   t0  = [ 100, ..., 115 ]
    
      //   t1  = [ 116, ..., 131 ]
    
      //   t2  = [ 132, ..., 147 ]
    
      //   ...
    
      //   t15 = [ 255, ..., 255 ]
    
      //
    
      //   input:    v = [  21,   3,  39,   6 ]
    
      //   TBL(t0):  d = [   0, 103,   0, 106 ] // index > 16 result in 0
    
      //   SUB:      v = [   5, 243,  23, 246 ] // subtracted 16 --> next table
    
      //   TBX(t1):  d = [ 121, 103,   0, 106 ] // index > 16 are ignored
    
      //   SUB:      v = [ 245, 227,   7, 230 ] // subtracted 16 --> next table
    
      //   TBX(t2):  d = [ 121, 103, 107, 106 ] // index > 16 are ignored
    
      //   ... etc.
    
      //
    
      // Bigger index tables (32, 48 or 64 values) can be used by TBX2 - TBX3 - TBX4.
    
      // In this case, instead of 16, 2/3/4 * 16 have to be subtracted from source.
    
      // The below solution (combining TBX2-TBX3) gives a good compromise between code
    
      // size and speed.
    
      template <typename ScalarType>
    
      class ScaleIntBase : public UnrollTwice {
    
       public:
    
      303
        ScaleIntBase(float scale, float shift) : scale_{scale}, shift_{shift} {}
    
       protected:
    
        static constexpr ScalarType ScalarMax =
    
            std::numeric_limits<ScalarType>::max();
    
        float scale_, shift_;
    
      };
    
      template <typename T>
    
      kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
    
                             size_t dst_stride, size_t width, size_t height,
    
                             float scale, float shift);
    
      template <typename T>
    
      2713
      T scale_value(T value, float scale, float shift) {
    
        static constexpr T ScalarMax = std::numeric_limits<T>::max();
    
      2713
        int64_t v = lrintf(static_cast<float>(value) * scale + shift);
    
        2/2✓ Branch 0 taken 2428 times.
✓ Branch 1 taken 285 times.

      2713
        if (static_cast<uint64_t>(v) <= ScalarMax) {
    
      2428
          return static_cast<T>(v);
    
        }
    
      285
        return static_cast<T>(v > 0 ? ScalarMax : 0);
    
      2713
      }
    
      class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using VecTraits = neon::VecTraits<ScalarType>;
    
        using VectorType = typename VecTraits::VectorType;
    
        using Vector2Type = typename VecTraits::Vector2Type;
    
        using Vector3Type = typename VecTraits::Vector3Type;
    
      149
        ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table)
    
      149
            : ScaleIntBase<uint8_t>(scale, shift),
    
      149
              table_pointer_(precalculated_table),
    
      149
              v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())),
    
      149
              v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) {
    
      149
          VecTraits::load(precalculated_table, t0_3_);
    
      149
          VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_);
    
      298
          VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(),
    
      149
                          t2_2_);
    
      298
          VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(),
    
      149
                          t3_3_);
    
      149
          VecTraits::load(
    
      149
              precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_);
    
      149
          VecTraits::load(
    
      149
              precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(),
    
      149
              t5_3_);
    
      149
        }
    
      2400
        VectorType vector_path(VectorType src) {
    
      2400
          VectorType dst = vqtbl3q_u8(t0_3_, src);
    
      2400
          src = vsubq_u8(src, v_step3_);
    
      2400
          dst = vqtbx3q_u8(dst, t1_3_, src);
    
      2400
          src = vsubq_u8(src, v_step3_);
    
      2400
          dst = vqtbx2q_u8(dst, t2_2_, src);
    
      2400
          src = vsubq_u8(src, v_step2_);
    
      2400
          dst = vqtbx3q_u8(dst, t3_3_, src);
    
      2400
          src = vsubq_u8(src, v_step3_);
    
      2400
          dst = vqtbx2q_u8(dst, t4_2_, src);
    
      2400
          src = vsubq_u8(src, v_step2_);
    
      2400
          dst = vqtbx3q_u8(dst, t5_3_, src);
    
      4800
          return dst;
    
      2400
        }
    
      2816
        ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; }
    
       private:
    
        const ScalarType *table_pointer_;
    
      149
        Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{};
    
      149
        Vector2Type t2_2_{}, t4_2_{};
    
        VectorType v_step3_, v_step2_;
    
      };  // end of class ScaleUint8Tbx<T>
    
      // Opposite to ScaleUint8Tbx, ScaleUint8Calc is the direct approach:
    
      // - calculate dst[i] = src[i] * scale + shift  using vector instructions
    
      class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using VecTraits = neon::VecTraits<ScalarType>;
    
        using VectorType = typename VecTraits::VectorType;
    
      154
        ScaleUint8Calc(float scale, float shift)
    
      154
            : ScaleIntBase<ScalarType>(scale, shift),
    
      154
              vscale_{vdupq_n_f32(scale)},
    
      154
              vshift_{vdupq_n_f32(shift)} {}
    
      1294
        VectorType vector_path(VectorType src) {
    
          // For scaling, uint8 values have to be converted to uint32
    
          // i.e. create four vectors from one
    
      1294
          uint32x4_t res11 = scale_shift(vqtbl1q_u8(src, w0));
    
      1294
          uint32x4_t res12 = scale_shift(vqtbl1q_u8(src, w1));
    
      1294
          uint32x4_t res21 = scale_shift(vqtbl1q_u8(src, w2));
    
      1294
          uint32x4_t res22 = scale_shift(vqtbl1q_u8(src, w3));
    
          // Convert back from 32-bit: top two bytes are 0 for sure, unzip them
    
      2588
          uint16x8_t res1 =
    
      1294
              vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
    
      2588
          uint16x8_t res2 =
    
      1294
              vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
    
          // Saturating narrowing from 16 to 8 bits
    
      2588
          return vqmovn_high_u16(vqmovn_u16(res1), res2);
    
      1294
        }
    
      2713
        ScalarType scalar_path(ScalarType src) {
    
      2713
          return scale_value(src, scale_, shift_);
    
        }
    
       private:
    
        static constexpr ScalarType FF = std::numeric_limits<uint8_t>::max();
    
        // clang-format off
    
        static constexpr uint8x16_t w0 = { 0, FF, FF, FF,  1, FF, FF, FF,  2, FF, FF, FF,  3, FF, FF, FF};
    
        static constexpr uint8x16_t w1 = { 4, FF, FF, FF,  5, FF, FF, FF,  6, FF, FF, FF,  7, FF, FF, FF};
    
        static constexpr uint8x16_t w2 = { 8, FF, FF, FF,  9, FF, FF, FF, 10, FF, FF, FF, 11, FF, FF, FF};
    
        static constexpr uint8x16_t w3 = {12, FF, FF, FF, 13, FF, FF, FF, 14, FF, FF, FF, 15, FF, FF, FF};
    
        // clang-format on
    
        // Convert from uint32 to float32, scale and convert back with rounding
    
      5176
        inline uint32x4_t scale_shift(VectorType src) {
    
      5176
          float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src));
    
          // scale + shift is done by MLA
    
      10352
          return vcvtnq_u32_f32(vmlaq_f32(vshift_, fx, vscale_));
    
      5176
        }
    
        float32x4_t vscale_, vshift_;
    
      };  // end of class ScaleUint8Calc<T>
    
      149
      kleidicv_error_t scale_with_precalculated_table(
    
          const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
    
          size_t width, size_t height, float scale, float shift,
    
          const std::array<uint8_t, 256> &precalculated_table) {
    
      149
        Rectangle rect{width, height};
    
      149
        Rows<const uint8_t> src_rows{src, src_stride};
    
      149
        Rows<uint8_t> dst_rows{dst, dst_stride};
    
      149
        ScaleUint8Tbx operation(scale, shift, precalculated_table.data());
    
      149
        apply_operation_by_rows(operation, rect, src_rows, dst_rows);
    
      149
        return KLEIDICV_OK;
    
      149
      }
    
      // Specialization for uint8_t
    
      template <>
    
      198
      kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst,
    
                             size_t dst_stride, size_t width, size_t height,
    
                             float scale, float shift) {
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 195 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 195 times.

      198
        CHECK_POINTER_AND_STRIDE(src, src_stride, height);
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 192 times.

      195
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    
        6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 189 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 186 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 186 times.

      192
        CHECK_IMAGE_SIZE(width, height);
    
        // For smaller inputs, the full calculation is the faster
    
        2/2✓ Branch 0 taken 154 times.
✓ Branch 1 taken 32 times.

      186
        if (width * height < 675) {  // empirical value
    
      154
          Rectangle rect{width, height};
    
      154
          Rows<const uint8_t> src_rows{src, src_stride};
    
      154
          Rows<uint8_t> dst_rows{dst, dst_stride};
    
      154
          ScaleUint8Calc operation(scale, shift);
    
      154
          apply_operation_by_rows(operation, rect, src_rows, dst_rows);
    
      154
        } else {
    
          // For bigger inputs, it's faster to pre-calculate the table
    
          // and map those values during the run
    
      32
          auto precalculated_table = precalculate_scale_table_u8(scale, shift);
    
      64
          return scale_with_precalculated_table(src, src_stride, dst, dst_stride,
    
      32
                                                width, height, scale, shift,
    
                                                precalculated_table);
    
      32
        }
    
      154
        return KLEIDICV_OK;
    
      198
      }
    
      7424
      static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) {
    
      7424
        float32x4_t fx = vcvtq_f32_u32(src);
    
      7424
        float32x4_t max = vdupq_n_f32(255.0F);
    
      7424
        float32x4_t min = vdupq_n_f32(0.0F);
    
      7424
        float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale));
    
      14848
        return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max)));
    
      7424
      }
    
      116
      std::array<uint8_t, 256> precalculate_scale_table_u8(float scale, float shift) {
    
        static constexpr size_t TableLength = 256;
    
      116
        std::array<uint8_t, TableLength> precalculated_table{};
    
      116
        uint32x4_t counter = {0, 1, 2, 3};
    
      116
        uint32x4_t four = vdupq_n_u32(4);
    
        2/2✓ Branch 0 taken 116 times.
✓ Branch 1 taken 1856 times.

      1972
        for (size_t i = 0; i < TableLength; i += 16) {
    
      1856
          uint32x4_t res11 = scale_shift(counter, scale, shift);
    
      1856
          counter = vaddq(counter, four);
    
      1856
          uint32x4_t res12 = scale_shift(counter, scale, shift);
    
      1856
          counter = vaddq(counter, four);
    
      1856
          uint32x4_t res21 = scale_shift(counter, scale, shift);
    
      1856
          counter = vaddq(counter, four);
    
      1856
          uint32x4_t res22 = scale_shift(counter, scale, shift);
    
      1856
          counter = vaddq(counter, four);
    
      3712
          uint16x8_t res1 =
    
      1856
              vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
    
      3712
          uint16x8_t res2 =
    
      1856
              vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
    
          // Saturating narrowing from 16 to 8 bits
    
      1856
          uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2);
    
      1856
          vst1q_u8(&precalculated_table[i], res);
    
      1856
        }
    
        return precalculated_table;
    
      116
      }
    
      // -----------------------------------------------------------------------
    
      // Float implementation
    
      // -----------------------------------------------------------------------
    
      class AddFloat final : public UnrollTwice, public UnrollOnce {
    
       public:
    
        using ScalarType = float;
    
        using VecTraits = neon::VecTraits<ScalarType>;
    
        using VectorType = typename VecTraits::VectorType;
    
      6
        explicit AddFloat(float shift) : shift_{shift}, vshift_{vdupq_n_f32(shift)} {}
    
      5034
        VectorType vector_path(VectorType src) { return vaddq_f32(vshift_, src); }
    
        // NOLINTBEGIN(readability-make-member-function-const)
    
      12
        ScalarType scalar_path(ScalarType src) { return src + shift_; }
    
        // NOLINTEND(readability-make-member-function-const)
    
       private:
    
        float shift_;
    
        float32x4_t vshift_;
    
      };  // end of class AddFloat
    
      class ScaleFloat final : public UnrollTwice, public UnrollOnce {
    
       public:
    
        using ScalarType = float;
    
        using VecTraits = neon::VecTraits<ScalarType>;
    
        using VectorType = typename VecTraits::VectorType;
    
      101
        ScaleFloat(float scale, float shift)
    
      101
            : scale_{scale},
    
      101
              shift_{shift},
    
      101
              vscale_{vdupq_n_f32(scale)},
    
      101
              vshift_{vdupq_n_f32(shift)} {}
    
      45897
        VectorType vector_path(VectorType src) {
    
      45897
          return vmlaq_f32(vshift_, src, vscale_);
    
        }
    
        // NOLINTBEGIN(readability-make-member-function-const)
    
      4083
        ScalarType scalar_path(ScalarType src) { return src * scale_ + shift_; }
    
        // NOLINTEND(readability-make-member-function-const)
    
       private:
    
        float scale_, shift_;
    
        float32x4_t vscale_, vshift_;
    
      };  // end of class ScaleFloat
    
      // Specialization for float
    
      template <>
    
      113
      kleidicv_error_t scale(const float *src, size_t src_stride, float *dst,
    
                             size_t dst_stride, size_t width, size_t height,
    
                             float scale, float shift) {
    
        4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 111 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 111 times.

      113
        CHECK_POINTER_AND_STRIDE(src, src_stride, height);
    
        4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 109 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 109 times.

      111
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    
        6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 108 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 107 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 107 times.

      109
        CHECK_IMAGE_SIZE(width, height);
    
      107
        Rectangle rect{width, height};
    
      107
        Rows<const float> src_rows{src, src_stride};
    
      107
        Rows<float> dst_rows{dst, dst_stride};
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 101 times.

      107
        if (scale == 1.0) {
    
      6
          AddFloat operation(shift);
    
      6
          apply_operation_by_rows(operation, rect, src_rows, dst_rows);
    
      6
        } else {
    
      101
          ScaleFloat operation(scale, shift);
    
      101
          apply_operation_by_rows(operation, rect, src_rows, dst_rows);
    
      101
        }
    
      107
        return KLEIDICV_OK;
    
      113
      }
    
      }  //  namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include <climits>
6			#include <cmath>
7			#include <cstdint>
8			#include <limits>
9
10			#include "kleidicv/arithmetics/scale.h"
11			#include "kleidicv/neon.h"
12			#include "kleidicv/traits.h"
13
14			namespace kleidicv::neon {
15
16			// Scale algorithm: for each value in the source,
17			// dst[i] = src[i] * scale + shift (floating point operation)
18			//
19			// Unsigned 8-bit implementation
20			//
21			// Since converting from uint8 to float32 and back takes more steps,
22			// 'ScaleTbx' saves time by pre-calculating all 256 values and uses TBLs
23			// and TBXs to map the values directly from uint8 to uint8:
24			// i: 0 to 255: tbl[i] = i * scale + shift
25			//
26			// Since a single TBL intruction can map only 16 values, more TBX instructions
27			// needed for the remaining 240 values. After the first TBL (that replaces
28			// 0-15 values with indexed values from the table) 16 is subtracted from all
29			// lanes in the source vector before the next TBX is done, so when indexing 0
30			// to 15, actually 16 to 31 values are replaced from the original source vector.
31			//
32			// Example:
33			// scale = 1
34			// shift = 100
35			// Initialization: (it also takes time, so for short inputs it's not used)
36			// tbl = [ 100, 101, 102, ..., 255, <100 times 255, it's saturated>]
37			// Copy table to vector registers:
38			// t0 = [ 100, ..., 115 ]
39			// t1 = [ 116, ..., 131 ]
40			// t2 = [ 132, ..., 147 ]
41			// ...
42			// t15 = [ 255, ..., 255 ]
43			//
44			// input: v = [ 21, 3, 39, 6 ]
45			// TBL(t0): d = [ 0, 103, 0, 106 ] // index > 16 result in 0
46			// SUB: v = [ 5, 243, 23, 246 ] // subtracted 16 --> next table
47			// TBX(t1): d = [ 121, 103, 0, 106 ] // index > 16 are ignored
48			// SUB: v = [ 245, 227, 7, 230 ] // subtracted 16 --> next table
49			// TBX(t2): d = [ 121, 103, 107, 106 ] // index > 16 are ignored
50			// ... etc.
51			//
52			// Bigger index tables (32, 48 or 64 values) can be used by TBX2 - TBX3 - TBX4.
53			// In this case, instead of 16, 2/3/4 * 16 have to be subtracted from source.
54			// The below solution (combining TBX2-TBX3) gives a good compromise between code
55			// size and speed.
56
57			template <typename ScalarType>
58			class ScaleIntBase : public UnrollTwice {
59			public:
60		303	ScaleIntBase(float scale, float shift) : scale_{scale}, shift_{shift} {}
61
62			protected:
63			static constexpr ScalarType ScalarMax =
64			std::numeric_limits<ScalarType>::max();
65
66			float scale_, shift_;
67			};
68
69			template <typename T>
70			kleidicv_error_t scale(const T src, size_t src_stride, T dst,
71			size_t dst_stride, size_t width, size_t height,
72			float scale, float shift);
73
74			template <typename T>
75		2713	T scale_value(T value, float scale, float shift) {
76			static constexpr T ScalarMax = std::numeric_limits<T>::max();
77		2713	int64_t v = lrintf(static_cast<float>(value) * scale + shift);
78	2/2 ✓ Branch 0 taken 2428 times. ✓ Branch 1 taken 285 times.	2713	if (static_cast<uint64_t>(v) <= ScalarMax) {
79		2428	return static_cast<T>(v);
80			}
81		285	return static_cast<T>(v > 0 ? ScalarMax : 0);
82		2713	}
83
84			class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
85			public:
86			using ScalarType = uint8_t;
87			using VecTraits = neon::VecTraits<ScalarType>;
88			using VectorType = typename VecTraits::VectorType;
89			using Vector2Type = typename VecTraits::Vector2Type;
90			using Vector3Type = typename VecTraits::Vector3Type;
91
92		149	ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table)
93		149	: ScaleIntBase<uint8_t>(scale, shift),
94		149	table_pointer_(precalculated_table),
95		149	v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())),
96		149	v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) {
97		149	VecTraits::load(precalculated_table, t0_3_);
98		149	VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_);
99		298	VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(),
100		149	t2_2_);
101		298	VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(),
102		149	t3_3_);
103		149	VecTraits::load(
104		149	precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_);
105		149	VecTraits::load(
106		149	precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(),
107		149	t5_3_);
108		149	}
109		2400	VectorType vector_path(VectorType src) {
110		2400	VectorType dst = vqtbl3q_u8(t0_3_, src);
111		2400	src = vsubq_u8(src, v_step3_);
112		2400	dst = vqtbx3q_u8(dst, t1_3_, src);
113		2400	src = vsubq_u8(src, v_step3_);
114		2400	dst = vqtbx2q_u8(dst, t2_2_, src);
115		2400	src = vsubq_u8(src, v_step2_);
116		2400	dst = vqtbx3q_u8(dst, t3_3_, src);
117		2400	src = vsubq_u8(src, v_step3_);
118		2400	dst = vqtbx2q_u8(dst, t4_2_, src);
119		2400	src = vsubq_u8(src, v_step2_);
120		2400	dst = vqtbx3q_u8(dst, t5_3_, src);
121		4800	return dst;
122		2400	}
123
124		2816	ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; }
125
126			private:
127			const ScalarType *table_pointer_;
128		149	Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{};
129		149	Vector2Type t2_2_{}, t4_2_{};
130			VectorType v_step3_, v_step2_;
131			}; // end of class ScaleUint8Tbx<T>
132
133			// Opposite to ScaleUint8Tbx, ScaleUint8Calc is the direct approach:
134			// - calculate dst[i] = src[i] * scale + shift using vector instructions
135			class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
136			public:
137			using ScalarType = uint8_t;
138			using VecTraits = neon::VecTraits<ScalarType>;
139			using VectorType = typename VecTraits::VectorType;
140
141		154	ScaleUint8Calc(float scale, float shift)
142		154	: ScaleIntBase<ScalarType>(scale, shift),
143		154	vscale_{vdupq_n_f32(scale)},
144		154	vshift_{vdupq_n_f32(shift)} {}
145
146		1294	VectorType vector_path(VectorType src) {
147			// For scaling, uint8 values have to be converted to uint32
148			// i.e. create four vectors from one
149		1294	uint32x4_t res11 = scale_shift(vqtbl1q_u8(src, w0));
150		1294	uint32x4_t res12 = scale_shift(vqtbl1q_u8(src, w1));
151		1294	uint32x4_t res21 = scale_shift(vqtbl1q_u8(src, w2));
152		1294	uint32x4_t res22 = scale_shift(vqtbl1q_u8(src, w3));
153			// Convert back from 32-bit: top two bytes are 0 for sure, unzip them
154		2588	uint16x8_t res1 =
155		1294	vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
156		2588	uint16x8_t res2 =
157		1294	vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
158
159			// Saturating narrowing from 16 to 8 bits
160		2588	return vqmovn_high_u16(vqmovn_u16(res1), res2);
161		1294	}
162
163		2713	ScalarType scalar_path(ScalarType src) {
164		2713	return scale_value(src, scale_, shift_);
165			}
166
167			private:
168			static constexpr ScalarType FF = std::numeric_limits<uint8_t>::max();
169			// clang-format off
170			static constexpr uint8x16_t w0 = { 0, FF, FF, FF, 1, FF, FF, FF, 2, FF, FF, FF, 3, FF, FF, FF};
171			static constexpr uint8x16_t w1 = { 4, FF, FF, FF, 5, FF, FF, FF, 6, FF, FF, FF, 7, FF, FF, FF};
172			static constexpr uint8x16_t w2 = { 8, FF, FF, FF, 9, FF, FF, FF, 10, FF, FF, FF, 11, FF, FF, FF};
173			static constexpr uint8x16_t w3 = {12, FF, FF, FF, 13, FF, FF, FF, 14, FF, FF, FF, 15, FF, FF, FF};
174			// clang-format on
175
176			// Convert from uint32 to float32, scale and convert back with rounding
177		5176	inline uint32x4_t scale_shift(VectorType src) {
178		5176	float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src));
179			// scale + shift is done by MLA
180		10352	return vcvtnq_u32_f32(vmlaq_f32(vshift_, fx, vscale_));
181		5176	}
182
183			float32x4_t vscale_, vshift_;
184			}; // end of class ScaleUint8Calc<T>
185
186		149	kleidicv_error_t scale_with_precalculated_table(
187			const uint8_t src, size_t src_stride, uint8_t dst, size_t dst_stride,
188			size_t width, size_t height, float scale, float shift,
189			const std::array<uint8_t, 256> &precalculated_table) {
190		149	Rectangle rect{width, height};
191		149	Rows<const uint8_t> src_rows{src, src_stride};
192		149	Rows<uint8_t> dst_rows{dst, dst_stride};
193		149	ScaleUint8Tbx operation(scale, shift, precalculated_table.data());
194		149	apply_operation_by_rows(operation, rect, src_rows, dst_rows);
195
196		149	return KLEIDICV_OK;
197		149	}
198
199			// Specialization for uint8_t
200			template <>
201		198	kleidicv_error_t scale(const uint8_t src, size_t src_stride, uint8_t dst,
202			size_t dst_stride, size_t width, size_t height,
203			float scale, float shift) {
204	4/4 ✓ Branch 0 taken 3 times. ✓ Branch 1 taken 195 times. ✓ Branch 2 taken 3 times. ✓ Branch 3 taken 195 times.	198	CHECK_POINTER_AND_STRIDE(src, src_stride, height);
205	4/4 ✓ Branch 0 taken 3 times. ✓ Branch 1 taken 192 times. ✓ Branch 2 taken 3 times. ✓ Branch 3 taken 192 times.	195	CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
206	6/6 ✓ Branch 0 taken 3 times. ✓ Branch 1 taken 189 times. ✓ Branch 2 taken 3 times. ✓ Branch 3 taken 186 times. ✓ Branch 4 taken 6 times. ✓ Branch 5 taken 186 times.	192	CHECK_IMAGE_SIZE(width, height);
207			// For smaller inputs, the full calculation is the faster
208	2/2 ✓ Branch 0 taken 154 times. ✓ Branch 1 taken 32 times.	186	if (width * height < 675) { // empirical value
209		154	Rectangle rect{width, height};
210		154	Rows<const uint8_t> src_rows{src, src_stride};
211		154	Rows<uint8_t> dst_rows{dst, dst_stride};
212		154	ScaleUint8Calc operation(scale, shift);
213		154	apply_operation_by_rows(operation, rect, src_rows, dst_rows);
214		154	} else {
215			// For bigger inputs, it's faster to pre-calculate the table
216			// and map those values during the run
217		32	auto precalculated_table = precalculate_scale_table_u8(scale, shift);
218		64	return scale_with_precalculated_table(src, src_stride, dst, dst_stride,
219		32	width, height, scale, shift,
220			precalculated_table);
221		32	}
222		154	return KLEIDICV_OK;
223		198	}
224
225		7424	static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) {
226		7424	float32x4_t fx = vcvtq_f32_u32(src);
227		7424	float32x4_t max = vdupq_n_f32(255.0F);
228		7424	float32x4_t min = vdupq_n_f32(0.0F);
229		7424	float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale));
230		14848	return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max)));
231		7424	}
232
233		116	std::array<uint8_t, 256> precalculate_scale_table_u8(float scale, float shift) {
234			static constexpr size_t TableLength = 256;
235		116	std::array<uint8_t, TableLength> precalculated_table{};
236
237		116	uint32x4_t counter = {0, 1, 2, 3};
238		116	uint32x4_t four = vdupq_n_u32(4);
239
240	2/2 ✓ Branch 0 taken 116 times. ✓ Branch 1 taken 1856 times.	1972	for (size_t i = 0; i < TableLength; i += 16) {
241		1856	uint32x4_t res11 = scale_shift(counter, scale, shift);
242		1856	counter = vaddq(counter, four);
243		1856	uint32x4_t res12 = scale_shift(counter, scale, shift);
244		1856	counter = vaddq(counter, four);
245		1856	uint32x4_t res21 = scale_shift(counter, scale, shift);
246		1856	counter = vaddq(counter, four);
247		1856	uint32x4_t res22 = scale_shift(counter, scale, shift);
248		1856	counter = vaddq(counter, four);
249
250		3712	uint16x8_t res1 =
251		1856	vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
252		3712	uint16x8_t res2 =
253		1856	vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
254			// Saturating narrowing from 16 to 8 bits
255		1856	uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2);
256
257		1856	vst1q_u8(&precalculated_table[i], res);
258		1856	}
259			return precalculated_table;
260		116	}
261
262			// -----------------------------------------------------------------------
263			// Float implementation
264			// -----------------------------------------------------------------------
265
266			class AddFloat final : public UnrollTwice, public UnrollOnce {
267			public:
268			using ScalarType = float;
269			using VecTraits = neon::VecTraits<ScalarType>;
270			using VectorType = typename VecTraits::VectorType;
271
272		6	explicit AddFloat(float shift) : shift_{shift}, vshift_{vdupq_n_f32(shift)} {}
273
274		5034	VectorType vector_path(VectorType src) { return vaddq_f32(vshift_, src); }
275
276			// NOLINTBEGIN(readability-make-member-function-const)
277		12	ScalarType scalar_path(ScalarType src) { return src + shift_; }
278			// NOLINTEND(readability-make-member-function-const)
279
280			private:
281			float shift_;
282			float32x4_t vshift_;
283			}; // end of class AddFloat
284
285			class ScaleFloat final : public UnrollTwice, public UnrollOnce {
286			public:
287			using ScalarType = float;
288			using VecTraits = neon::VecTraits<ScalarType>;
289			using VectorType = typename VecTraits::VectorType;
290
291		101	ScaleFloat(float scale, float shift)
292		101	: scale_{scale},
293		101	shift_{shift},
294		101	vscale_{vdupq_n_f32(scale)},
295		101	vshift_{vdupq_n_f32(shift)} {}
296
297		45897	VectorType vector_path(VectorType src) {
298		45897	return vmlaq_f32(vshift_, src, vscale_);
299			}
300
301			// NOLINTBEGIN(readability-make-member-function-const)
302		4083	ScalarType scalar_path(ScalarType src) { return src * scale_ + shift_; }
303			// NOLINTEND(readability-make-member-function-const)
304
305			private:
306			float scale_, shift_;
307			float32x4_t vscale_, vshift_;
308			}; // end of class ScaleFloat
309
310			// Specialization for float
311			template <>
312		113	kleidicv_error_t scale(const float src, size_t src_stride, float dst,
313			size_t dst_stride, size_t width, size_t height,
314			float scale, float shift) {
315	4/4 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 111 times. ✓ Branch 2 taken 2 times. ✓ Branch 3 taken 111 times.	113	CHECK_POINTER_AND_STRIDE(src, src_stride, height);
316	4/4 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 109 times. ✓ Branch 2 taken 2 times. ✓ Branch 3 taken 109 times.	111	CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
317	6/6 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 108 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 107 times. ✓ Branch 4 taken 2 times. ✓ Branch 5 taken 107 times.	109	CHECK_IMAGE_SIZE(width, height);
318
319		107	Rectangle rect{width, height};
320		107	Rows<const float> src_rows{src, src_stride};
321		107	Rows<float> dst_rows{dst, dst_stride};
322	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 101 times.	107	if (scale == 1.0) {
323		6	AddFloat operation(shift);
324		6	apply_operation_by_rows(operation, rect, src_rows, dst_rows);
325		6	} else {
326		101	ScaleFloat operation(scale, shift);
327		101	apply_operation_by_rows(operation, rect, src_rows, dst_rows);
328		101	}
329		107	return KLEIDICV_OK;
330		113	}
331
332			} // namespace kleidicv::neon
333