KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/resize/resize_linear_generic_sc.h
Date:	2026-03-05 15:57:40
	Exec	Total	Coverage
Lines:	375	375	100.0%
Functions:	552	552	100.0%
Branches:	128	128	100.0%
  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #ifndef KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H
    
      #define KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H
    
      #include <algorithm>
    
      #include <cstddef>
    
      #include <memory>
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/sve2.h"
    
      namespace KLEIDICV_TARGET_NAMESPACE {
    
      //------------------------------------------------------
    
      /// Generic resize for ratios 1/3 to 1/1, u8, 1channel
    
      //------------------------------------------------------
    
      namespace resize_generic_u8 {
    
      // For the coordinate calculation, fixed-point format is used, for better
    
      // performance. Fixed-point format:
    
      // - lowest 16 bits are the fractional part, that is the kFixpBits constant
    
      // - at interpolation, the high 8 bits are used from the fractional part
    
      //   (this is a good compromise between accuracy and performance: because the
    
      //   result is 8bits, the error only affects the least significant 1-2 bits, see
    
      //   the accuracy calculation in kleidicv.h
    
      // - to get the integer part, right shift by 16 bits, or zip/unzip/tbl etc. to
    
      //   get the bytes needed
    
      // - for better accuracy, rounding is needed everywhere, i.e. adding 0.5, which
    
      //   is 1 << 15
    
      static constexpr ptrdiff_t kFixpBits = 16;
    
      static constexpr ptrdiff_t kFixpHalf = (1UL << (kFixpBits - 1));
    
      // Precalc 1 item:
    
      // Frac:       2 vectors u16
    
      // Idx:        1 vector   u8 (left_idx)
    
      // Src_index:  uint64 (separate array)
    
      template <size_t kRatio>
    
      struct PrecalcIterator {
    
        size_t index_;
    
        uint64_t *src_index_ptr_;
    
        const size_t kStep, kIdxFracStep;
    
        uint8_t *idx_ptr_;
    
        uint16_t *frac_ptr_;
    
      5267
        PrecalcIterator(size_t kStepDst, uint64_t *src_indices,
    
                        uint8_t *p_idx_frac) KLEIDICV_STREAMING
    
      5267
            : index_{0},
    
      5267
              src_index_ptr_{src_indices},
    
      5267
              kStep{kStepDst},
    
      5267
              kIdxFracStep{kStep * (2 + 1)},
    
      5267
              idx_ptr_{p_idx_frac},
    
      5267
              frac_ptr_{reinterpret_cast<uint16_t *>(p_idx_frac + kStep)} {}
    
      769817
        PrecalcIterator &operator++() KLEIDICV_STREAMING {
    
      769817
          ++index_;
    
      769817
          ++src_index_ptr_;
    
      769817
          idx_ptr_ += kIdxFracStep;
    
      769817
          frac_ptr_ += kIdxFracStep / 2;
    
      769817
          return *this;
    
        }
    
      };
    
      template <ptrdiff_t kRatio, ptrdiff_t kChannels>
    
      class PrecalcIndicesFractions final {
    
       public:
    
      457
        PrecalcIndicesFractions(size_t src_width, size_t dst_width,
    
                                ptrdiff_t kStep) KLEIDICV_STREAMING
    
      457
            : src_width_{src_width},
    
      457
              dst_width_{dst_width},
    
      457
              n_iterations_{0},
    
      457
              n_iterations_2x_{0},
    
      457
              kStep_{kStep},
    
      457
              precalc_src_bases_{nullptr, &std::free},
    
      457
              precalc_idx_frac_{nullptr, &std::free} {}
    
      5267
        PrecalcIterator<kRatio> begin() const KLEIDICV_STREAMING {
    
      10534
          return PrecalcIterator<kRatio>(kStep_, precalc_src_bases_.get(),
    
      5267
                                         precalc_idx_frac_.get());
    
        }
    
      318
        bool precalculate_indices_fractions_srcindices() KLEIDICV_STREAMING {
    
        8/8✓ Branch 0 taken 72 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 81 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 72 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 81 times.
✓ Branch 7 taken 3 times.

      318
          if (!allocate_temp_buffers()) {
    
      12
            return false;
    
          }
    
          // These starting values are not aligned to center. The center alignment
    
          // must be added only once. When added to a center-aligned source_x
    
          // value, the result will be center-aligned.
    
      306
          svuint32_t vsx0b = make_vsx0(0);
    
      306
          svuint32_t vsx0t = make_vsx0(1);
    
      306
          svuint32_t vsx1b = make_vsx0(2 * svcntw());
    
      306
          svuint32_t vsx1t = make_vsx0(2 * svcntw() + 1);
    
          // from each even 16bit element, take the low byte, and the high is 0
    
      612
          svuint8_t vsxfrac_bottom_tbl =
    
      306
              svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004));
    
          // from each odd 16bit element, take the low byte, and the high is 0
    
      612
          svuint8_t vsxfrac_top_tbl =
    
      306
              svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004));
    
      612
          svuint8_t vchannels = svreinterpret_u8_u32(
    
      306
              svdup_n_u32(kChannels == 4 ? 0x03020100U : 0x01000100));
    
          // Difference in source x coordinate, for one vector path
    
      612
          const uint64_t sx_fixp_step = rounding_div(
    
      306
              ((src_width_ * kStep_ / kChannels) << kFixpBits), dst_width_);
    
      306
          uint64_t sx_fixp = to_src_x(0);
    
      612
          const uint64_t max_src_index =
    
      306
              std::max(src_width_ * kChannels - kStep_ * kRatio, 0UL);
    
          // For 1,2,4 channels dx can be iterated vector by vector, but not for 3
    
      306
          ptrdiff_t dx = 0;
    
        8/8✓ Branch 0 taken 29827 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 59729 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 30434 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 61005 times.
✓ Branch 7 taken 81 times.

      181301
          for (auto pcit = begin(); pcit.index_ < n_iterations_;
    
      180995
               ++pcit, dx += kStep_ / kChannels) {
    
            // Repeatedly adding sx_fixp_vector_step is faster than multiplication,
    
            // but it accumulates fixed-point error; periodic recalibration resets
    
            // it. The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1
    
            // << 16). Only the upper 8 bits of the 16-bit fractional part are used
    
            // for interpolation, so once the accumulated error reaches 1 / (1 <<
    
            // 8), it can affect later stages. This corresponds to 512 additions,
    
            // which is calculated by this mask.
    
      180995
            constexpr uint64_t kRecalibrateCycleMask = ((1 << 9) - 1);
    
        8/8✓ Branch 0 taken 29699 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 59534 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 30305 times.
✓ Branch 5 taken 129 times.
✓ Branch 6 taken 60808 times.
✓ Branch 7 taken 197 times.

      180995
            if ((pcit.index_ & kRecalibrateCycleMask) == 0) {
    
      649
              sx_fixp = to_src_x(dx);
    
      649
            }
    
        8/8✓ Branch 0 taken 29796 times.
✓ Branch 1 taken 31 times.
✓ Branch 2 taken 59668 times.
✓ Branch 3 taken 61 times.
✓ Branch 4 taken 30405 times.
✓ Branch 5 taken 29 times.
✓ Branch 6 taken 60948 times.
✓ Branch 7 taken 57 times.

      180995
            n_iterations_2x_ = (sx_fixp >> kFixpBits) * kChannels <= max_src_index
    
      180817
                                   ? pcit.index_
    
      178
                                   : n_iterations_2x_;
    
      180995
            calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b, vsx0t, vsx1b,
    
                                                 vsx1t, vsxfrac_bottom_tbl,
    
                                                 vsxfrac_top_tbl, vchannels);
    
      180995
            sx_fixp += sx_fixp_step;
    
      180995
          }
    
      306
          return true;
    
      318
        }
    
      139
        bool precalculate_indices_fractions_srcindices_3ch() KLEIDICV_STREAMING {
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 69 times.

      139
          if (!allocate_temp_buffers()) {
    
      6
            return false;
    
          }
    
          // These starting values are not aligned to center. The center alignment
    
          // must be added only once. When added to a center-aligned source_x
    
          // value, the result will be center-aligned.
    
      133
          svuint32_t vsx0b_R = make_vsx0(0);
    
      133
          svuint32_t vsx0t_R = make_vsx0(1);
    
      133
          svuint32_t vsx1b_R = make_vsx0(2 * svcntw());
    
      133
          svuint32_t vsx1t_R = make_vsx0(2 * svcntw() + 1);
    
      133
          svuint32_t vsx0b_G = make_vsx0(4 * svcntw());
    
      133
          svuint32_t vsx0t_G = make_vsx0(4 * svcntw() + 1);
    
      133
          svuint32_t vsx1b_G = make_vsx0(6 * svcntw());
    
      133
          svuint32_t vsx1t_G = make_vsx0(6 * svcntw() + 1);
    
      133
          svuint32_t vsx0b_B = make_vsx0(8 * svcntw());
    
      133
          svuint32_t vsx0t_B = make_vsx0(8 * svcntw() + 1);
    
      133
          svuint32_t vsx1b_B = make_vsx0(10 * svcntw());
    
      133
          svuint32_t vsx1t_B = make_vsx0(10 * svcntw() + 1);
    
      133
          size_t kVL = svcntb();
    
      133
          svuint8_t vchannels_R = svindex_u8(0, 1);
    
      133
          svuint8_t vchannels_G = svindex_u8(kVL % 3, 1);
    
      133
          svuint8_t vchannels_B = svindex_u8((kVL + kVL) % 3, 1);
    
          // Decrease by 3 while they are >= 3 --> so we get the modulo
    
      133
          size_t steps = (kVL - 1) / 3;
    
        4/4✓ Branch 0 taken 992 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 1081 times.
✓ Branch 3 taken 69 times.

      2206
          for (size_t i = 0; i < steps; ++i) {
    
      4146
            vchannels_R = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_R, 3),
    
      2073
                                       vchannels_R, 3);
    
      4146
            vchannels_G = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_G, 3),
    
      2073
                                       vchannels_G, 3);
    
      4146
            vchannels_B = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_B, 3),
    
      2073
                                       vchannels_B, 3);
    
      2073
          }
    
          // from each even 16bit element, take the low byte, and the high is 0
    
      266
          svuint8_t vsxfrac_bottom_tbl =
    
      133
              svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004));
    
          // from each odd 16bit element, take the low byte, and the high is 0
    
      266
          svuint8_t vsxfrac_top_tbl =
    
      133
              svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004));
    
          // Difference in source x coordinate, for three vector paths (one iteration
    
          // in this calculation)
    
      266
          const uint64_t sx_fixp_step3 =
    
      133
              rounding_div((src_width_ * kStep_) << kFixpBits, dst_width_);
    
      133
          uint64_t sx_fixp = to_src_x(0);
    
      266
          const uint64_t max_src_index =
    
      133
              std::max(src_width_ * kChannels - kStep_ * kRatio, 0UL);
    
      133
          ptrdiff_t dx = 0;
    
      133
          auto pcit = begin();
    
        4/4✓ Branch 0 taken 29 times.
✓ Branch 1 taken 29837 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 30590 times.

      60474
          while (pcit.index_ < n_iterations_) {
    
            // Repeatedly adding sx_fixp_vector_step is faster than multiplication,
    
            // but it accumulates fixed-point error; periodic recalibration resets
    
            // it. The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1
    
            // << 16). Only the upper 8 bits of the 16-bit fractional part are used
    
            // for interpolation, so once the accumulated error reaches 1 / (1 <<
    
            // 8), it can affect later stages. This corresponds to 512 additions,
    
            // but it will trigger each 3rd time, so the mask should be set to 128.
    
      60427
            constexpr uint64_t kRecalibrateCycleMask = ((1 << 7) - 1);
    
        4/4✓ Branch 0 taken 29544 times.
✓ Branch 1 taken 293 times.
✓ Branch 2 taken 30286 times.
✓ Branch 3 taken 304 times.

      60427
            if ((pcit.index_ & kRecalibrateCycleMask) == 0) {
    
      597
              sx_fixp = to_src_x(dx);
    
      597
            }
    
      60427
            calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_R, vsx0t_R,
    
                                                 vsx1b_R, vsx1t_R, vsxfrac_bottom_tbl,
    
                                                 vsxfrac_top_tbl, vchannels_R);
    
        4/4✓ Branch 0 taken 29822 times.
✓ Branch 1 taken 15 times.
✓ Branch 2 taken 30580 times.
✓ Branch 3 taken 10 times.

      60427
            n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index
    
      60402
                                   ? pcit.index_
    
      25
                                   : n_iterations_2x_;
    
      60427
            ++pcit;
    
        4/4✓ Branch 0 taken 29 times.
✓ Branch 1 taken 29808 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 30580 times.

      60427
            if (pcit.index_ >= n_iterations_) {
    
      39
              break;
    
            }
    
      60388
            calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_G, vsx0t_G,
    
                                                 vsx1b_G, vsx1t_G, vsxfrac_bottom_tbl,
    
                                                 vsxfrac_top_tbl, vchannels_G);
    
        4/4✓ Branch 0 taken 29790 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 30537 times.
✓ Branch 3 taken 43 times.

      60388
            n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index
    
      60327
                                   ? pcit.index_
    
      61
                                   : n_iterations_2x_;
    
      60388
            ++pcit;
    
        4/4✓ Branch 0 taken 6 times.
✓ Branch 1 taken 29802 times.
✓ Branch 2 taken 41 times.
✓ Branch 3 taken 30539 times.

      60388
            if (pcit.index_ >= n_iterations_) {
    
      47
              break;
    
            }
    
      60341
            calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_B, vsx0t_B,
    
                                                 vsx1b_B, vsx1t_B, vsxfrac_bottom_tbl,
    
                                                 vsxfrac_top_tbl, vchannels_B);
    
        4/4✓ Branch 0 taken 29771 times.
✓ Branch 1 taken 31 times.
✓ Branch 2 taken 30519 times.
✓ Branch 3 taken 20 times.

      60341
            n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index
    
      60290
                                   ? pcit.index_
    
      51
                                   : n_iterations_2x_;
    
      60341
            ++pcit;
    
      60341
            sx_fixp += sx_fixp_step3;
    
      60341
            dx += kStep_;
    
      60427
          }
    
      133
          return true;
    
      139
        }
    
      17206
        size_t n_iterations() const KLEIDICV_STREAMING { return n_iterations_; }
    
      202472
        size_t n_iterations_2x() const KLEIDICV_STREAMING { return n_iterations_2x_; }
    
        uint64_t *src_bases() const KLEIDICV_STREAMING {
    
          return precalc_src_bases_.get();
    
        }
    
        uint8_t *idx_frac() const KLEIDICV_STREAMING {
    
          return precalc_idx_frac_.get();
    
        }
    
       private:
    
        using FreeDeleter = decltype(&std::free);
    
      457
        bool allocate_temp_buffers() KLEIDICV_STREAMING {
    
          // Allocate a bit more so don't have to care about overindexing
    
      457
          ptrdiff_t rounded_width = align_up(dst_width_ * kChannels, kStep_);
    
      457
          n_iterations_ = rounded_width / kStep_;
    
      457
          size_t idx_bytes = sizeof(uint8_t) * rounded_width;
    
      457
          size_t xfrac_bytes = sizeof(uint16_t) * rounded_width;
    
      914
          precalc_idx_frac_.reset(
    
      457
              static_cast<uint8_t *>(malloc(idx_bytes + xfrac_bytes)));
    
      457
          size_t src_bases_bytes = sizeof(uint64_t) * rounded_width / kStep_;
    
      457
          precalc_src_bases_.reset(static_cast<uint64_t *>(malloc(src_bases_bytes)));
    
      1371
          return (reinterpret_cast<uintptr_t>(precalc_idx_frac_.get()) &
    
      457
                  reinterpret_cast<uintptr_t>(precalc_src_bases_.get()));
    
      457
        }
    
        template <typename T = uint64_t>
    
      35868
        static T rounding_div(uint64_t nom, uint64_t denom) KLEIDICV_STREAMING {
    
      35868
          return static_cast<T>((nom + denom / 2) / denom);
    
        }
    
        // Scale coordinate using this formula, so the center is aligned:
    
        //   source_x = (destination_x + 0.5) / scale - 0.5;
    
        //   plus 1/256/2 for later rounding the fractional part to 8bits
    
      1685
        static uint64_t aligned_scale(uint64_t x, uint64_t nom,
    
                                      uint64_t denom) KLEIDICV_STREAMING {
    
      3370
          return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) -
    
      1685
                 kFixpHalf + (1 << (kFixpBits - 9));
    
        }
    
      1685
        uint64_t to_src_x(uint64_t dx) const KLEIDICV_STREAMING {
    
      1685
          return aligned_scale(dx, src_width_, dst_width_);
    
        }
    
        // Scale destination x coordinate to source x coordinate, into fixed-point,
    
        // without center correction
    
      33744
        uint32_t scale_x(uint64_t dx) const KLEIDICV_STREAMING {
    
      33744
          return rounding_div<uint32_t>(((dx * src_width_) << kFixpBits), dst_width_);
    
        }
    
      2820
        svuint32_t make_vsx0(uint64_t dx) const KLEIDICV_STREAMING {
    
          // Creates source x coordinates starting with dx, stepping by 2
    
          // and finally shifted left by 8, to support the later svaddhn operation
    
      2820
          uint32_t sx[64];  // maximum possible vector length in u32 units
    
        12/12✓ Branch 0 taken 288 times.
✓ Branch 1 taken 3456 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 3888 times.
✓ Branch 4 taken 768 times.
✓ Branch 5 taken 9120 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 3456 times.
✓ Branch 8 taken 324 times.
✓ Branch 9 taken 3888 times.
✓ Branch 10 taken 828 times.
✓ Branch 11 taken 9936 times.

      36564
          for (size_t i = 0; i < svcntw(); ++i) {
    
      33744
            sx[i] = scale_x((dx + 2 * i) / kChannels) << 8;
    
      33744
          }
    
      5640
          return svld1(svptrue_b32(), sx);
    
      2820
        }
    
      362151
        void calculate_indices_fractions_srcindex(
    
            PrecalcIterator<kRatio> &pcit, uint64_t sx_fixp, const svuint32_t &vsx0b,
    
            const svuint32_t &vsx0t, const svuint32_t &vsx1b, const svuint32_t &vsx1t,
    
            const svuint8_t &vsxfrac_bottom_tbl, const svuint8_t &vsxfrac_top_tbl,
    
            [[maybe_unused]] const svuint8_t &vchannels) const KLEIDICV_STREAMING {
    
          // << 8: to prepare for addhn, have the fractional part in the high half
    
      724302
          uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp & ((1 << kFixpBits) - 1))
    
      362151
                            << 8;
    
          // get the interesting part: 8+8 bits of integer and fractional part
    
      724302
          svuint16x2_t vsx_delta =
    
      724302
              svcreate2(svaddhnt_n_u32(svaddhnb_n_u32(vsx0b, xfrac0), vsx0t, xfrac0),
    
      362151
                        svaddhnt_n_u32(svaddhnb_n_u32(vsx1b, xfrac0), vsx1t, xfrac0));
    
          if constexpr (kChannels == 3) {
    
            // When vsx0 starts from other than zero, this offset must be subtracted
    
      181156
            uint16_t start{};
    
      181156
            svst1(svptrue_pat_b16(SV_VL1), &start, svget2(vsx_delta, 0));
    
      181156
            start = start & 0xFF00;
    
      181156
            vsx_delta =
    
      362312
                svcreate2(svsub_n_u16_x(svptrue_b16(), svget2(vsx_delta, 0), start),
    
      181156
                          svsub_n_u16_x(svptrue_b16(), svget2(vsx_delta, 1), start));
    
      181156
            sx_fixp += (start >> 8) << kFixpBits;
    
      181156
          }
    
      724302
          svuint8x2_t vsx_delta8 =
    
      724302
              svcreate2(svreinterpret_u8_u16(svget2(vsx_delta, 0)),
    
      362151
                        svreinterpret_u8_u16(svget2(vsx_delta, 1)));
    
          // left pixels' indices: integer part
    
      724302
          svuint8_t vsx_left_idx =
    
      362151
              svuzp2_u8(svget2(vsx_delta8, 0), svget2(vsx_delta8, 1));
    
          if constexpr (kChannels > 1) {
    
            if constexpr (kChannels == 3) {
    
      181156
              vsx_left_idx = svmul_n_u8_x(svptrue_b8(), vsx_left_idx, 3);
    
            } else {
    
              static_assert(kChannels == 2 || kChannels == 4);
    
      120734
              vsx_left_idx =
    
      120734
                  svlsl_n_u8_x(svptrue_b8(), vsx_left_idx, kChannels == 4 ? 2 : 1);
    
            }
    
      301890
            vsx_left_idx = svadd_u8_x(svptrue_b8(), vsx_left_idx, vchannels);
    
          }
    
      362151
          uint64_t srcindex = (sx_fixp >> kFixpBits) * kChannels;
    
          if constexpr (kChannels == 3) {
    
            // When vsx_left_idx starts from other than zero, this offset must be
    
            // subtracted
    
      181156
            uint8_t start{};
    
      181156
            svst1(svptrue_pat_b8(SV_VL1), &start, vsx_left_idx);
    
      181156
            vsx_left_idx = svsub_n_u8_x(svptrue_b8(), vsx_left_idx, start);
    
      181156
            srcindex += start;
    
      181156
          }
    
      362151
          *pcit.src_index_ptr_ = srcindex;
    
      362151
          svst1(svptrue_b8(), pcit.idx_ptr_, vsx_left_idx);
    
          // fractional part is widened to 16 bits for further operations
    
      724302
          svuint16_t vsxfrac_b =
    
      362151
              svreinterpret_u16_u8(svtbl2_u8(vsx_delta8, vsxfrac_bottom_tbl));
    
      724302
          svuint16_t vsxfrac_t =
    
      362151
              svreinterpret_u16_u8(svtbl2_u8(vsx_delta8, vsxfrac_top_tbl));
    
      362151
          svst1(svptrue_b16(), pcit.frac_ptr_, vsxfrac_b);
    
      362151
          svst1_vnum(svptrue_b16(), pcit.frac_ptr_, 1, vsxfrac_t);
    
      362151
        }
    
        const size_t src_width_;
    
        const size_t dst_width_;
    
        size_t n_iterations_;
    
        size_t n_iterations_2x_;
    
        const ptrdiff_t kStep_;
    
        std::unique_ptr<uint64_t, FreeDeleter> precalc_src_bases_;
    
        std::unique_ptr<uint8_t, FreeDeleter> precalc_idx_frac_;
    
      };
    
      // ratio: number of vectors to load and resize to 1 vector
    
      // - supported combinations of (ratio, channel):
    
      // (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3)
    
      template <ptrdiff_t kRatio, ptrdiff_t kChannels>
    
      class ResizeGenericU8Operation final {
    
       public:
    
      457
        ResizeGenericU8Operation(const uint8_t *src, size_t src_stride,
    
                                 size_t src_width, size_t src_height, size_t y_begin,
    
                                 size_t y_end,
    
                                 uint8_t *dst,  // NOLINT
    
                                 size_t dst_stride, size_t dst_width,
    
                                 size_t dst_height) KLEIDICV_STREAMING
    
      457
            : src_rows_{src, src_stride, kChannels},
    
      457
              dst_rows_{dst, dst_stride, kChannels},
    
      457
              src_width_{src_width},
    
      457
              src_height_{src_height},
    
      457
              y_begin_{y_begin},
    
      457
              y_end_{y_end},
    
      457
              dst_width_{dst_width},
    
      457
              dst_height_{dst_height},
    
      457
              kStep_{static_cast<ptrdiff_t>(svcntb())},
    
      457
              precalc_{src_width, dst_width, kStep_} {}
    
      457
        kleidicv_error_t process_rows() KLEIDICV_STREAMING {
    
      457
          bool precalc_success = false;
    
          if constexpr (kChannels == 3) {
    
      139
            precalc_success =
    
      139
                precalc_.precalculate_indices_fractions_srcindices_3ch();
    
          } else {
    
      318
            precalc_success = precalc_.precalculate_indices_fractions_srcindices();
    
          }
    
        12/12✓ Branch 0 taken 72 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 81 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 64 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 3 times.
✓ Branch 8 taken 81 times.
✓ Branch 9 taken 3 times.
✓ Branch 10 taken 69 times.
✓ Branch 11 taken 3 times.

      457
          if (!precalc_success) {
    
      18
            return KLEIDICV_ERROR_ALLOCATION;
    
          }
    
        12/12✓ Branch 0 taken 672 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1032 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 964 times.
✓ Branch 5 taken 64 times.
✓ Branch 6 taken 498 times.
✓ Branch 7 taken 72 times.
✓ Branch 8 taken 858 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 804 times.
✓ Branch 11 taken 69 times.

      5267
          for (uint64_t dst_y = y_begin_; dst_y < y_end_; ++dst_y) {
    
      4828
            process_row(dst_y);
    
      4828
          }
    
      439
          return KLEIDICV_OK;
    
      457
        }
    
       private:
    
        template <typename T = uint64_t>
    
      4828
        static T rounding_div(uint64_t nom, uint64_t denom) KLEIDICV_STREAMING {
    
      4828
          return static_cast<T>((nom + denom / 2) / denom);
    
        }
    
        // Scale coordinate using this formula, so the center is aligned:
    
        //   source_x = (destination_x + 0.5) / scale - 0.5;
    
        //   plus 1/256/2 for later rounding the fractional part to 8bits
    
      4828
        static uint64_t aligned_scale(uint64_t x, uint64_t nom,
    
                                      uint64_t denom) KLEIDICV_STREAMING {
    
      9656
          return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) -
    
      4828
                 kFixpHalf + (1 << (kFixpBits - 9));
    
        }
    
      4828
        uint64_t to_src_y(uint64_t dy) const KLEIDICV_STREAMING {
    
      4828
          return aligned_scale(dy, src_height_, dst_height_);
    
        }
    
      1222998
        static svuint16_t svshll8b(svuint8_t a) KLEIDICV_STREAMING {
    
      1222998
          return svreinterpret_u16_u8(svtrn1(svdup_n_u8(0), a));
    
        }
    
      1222998
        static svuint16_t svshll8t(svuint8_t a) KLEIDICV_STREAMING {
    
      1222998
          return svreinterpret_u16_u8(svtrn2(svdup_n_u8(0), a));
    
        }
    
      398404
        static svuint8x2_t load8x2_u8(const uint8_t *p) KLEIDICV_STREAMING {
    
      #if KLEIDICV_TARGET_SME2
    
      64120
          return svld1_x2(svptrue_c8(), p);
    
      #else
    
      334284
          return svcreate2(svld1(svptrue_b8(), p), svld1_vnum(svptrue_b8(), p, 1));
    
      #endif
    
        }
    
      15268
        svuint8x2_t load8x2_while_u8(const uint8_t *p, uint64_t i,
    
                                     uint64_t n) const KLEIDICV_STREAMING {
    
      #if KLEIDICV_TARGET_SME2
    
      5264
          return svld1_x2(svwhilelt_c8(i, n, 2), p);
    
      #else
    
      10004
          svbool_t pg1 = svwhilelt_b8(i, n);
    
      10004
          svbool_t pg2 = svwhilelt_b8(i + kStep_, n);
    
      20008
          return svcreate2(svld1(pg1, p), svld1_vnum(pg2, p, 1));
    
      #endif
    
      10004
        }
    
      392172
        static svuint8x3_t load8x3_u8(const uint8_t *p) KLEIDICV_STREAMING {
    
      #if KLEIDICV_TARGET_SME2
    
      64196
          svuint8x2_t sv2 = svld1_x2(svptrue_c8(), p);
    
      192588
          return svcreate3(svget2(sv2, 0), svget2(sv2, 1),
    
      64196
                           svld1_vnum(svptrue_b8(), p, 2));
    
      #else
    
      655952
          return svcreate3(svld1(svptrue_b8(), p), svld1_vnum(svptrue_b8(), p, 1),
    
      327976
                           svld1_vnum(svptrue_b8(), p, 2));
    
      #endif
    
      64196
        }
    
      9488
        svuint8x3_t load8x3_while_u8(const uint8_t *p, uint64_t i,
    
                                     uint64_t n) const KLEIDICV_STREAMING {
    
      #if KLEIDICV_TARGET_SME2
    
      3050
          svcount_t pgc = svwhilelt_c8(i, n, 2);
    
      3050
          svbool_t pgb = svwhilelt_b8(i + 2 * kStep_, n);
    
      3050
          svuint8x2_t sv2 = svld1_x2(pgc, p);
    
      6100
          return svcreate3(svget2(sv2, 0), svget2(sv2, 1), svld1_vnum(pgb, p, 2));
    
      #else
    
      6438
          svbool_t pg1 = svwhilelt_b8(i, n);
    
      6438
          svbool_t pg2 = svwhilelt_b8(i + kStep_, n);
    
      6438
          svbool_t pg3 = svwhilelt_b8(i + 2 * kStep_, n);
    
      19314
          return svcreate3(svld1(pg1, p), svld1_vnum(pg2, p, 1),
    
      6438
                           svld1_vnum(pg3, p, 2));
    
      #endif
    
      9488
        }
    
      407666
        svuint8_t interpolate(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
    
                              svuint8_t a, svuint8_t b, svuint8_t c,
    
                              svuint8_t d) const KLEIDICV_STREAMING {
    
      #if KLEIDICV_TARGET_SME2
    
      68315
          svuint16x2_t vsxfrac = svld1_x2(svptrue_c8(), pcit.frac_ptr_);
    
      68315
          svuint16_t vsxfrac_b = svget2(vsxfrac, 0);
    
      68315
          svuint16_t vsxfrac_t = svget2(vsxfrac, 1);
    
      #else
    
      339351
          svuint16_t vsxfrac_b = svld1(svptrue_b16(), pcit.frac_ptr_);
    
      339351
          svuint16_t vsxfrac_t = svld1_vnum(svptrue_b16(), pcit.frac_ptr_, 1);
    
      #endif
    
      407666
          svuint16_t half = svdup_n_u16(128);
    
      815332
          svuint8_t left = svaddhnb(
    
      407666
              svshll8b(a), svmla_n_u16_x(svptrue_b16(), half, svsublb(c, a), yfrac));
    
      815332
          svuint8_t right = svaddhnb(
    
      407666
              svshll8b(b), svmla_n_u16_x(svptrue_b16(), half, svsublb(d, b), yfrac));
    
      815332
          left = svaddhnt(left, svshll8t(a),
    
      407666
                          svmla_n_u16_x(svptrue_b16(), half, svsublt(c, a), yfrac));
    
      815332
          right = svaddhnt(right, svshll8t(b),
    
      407666
                           svmla_n_u16_x(svptrue_b16(), half, svsublt(d, b), yfrac));
    
      815332
          svuint8_t res =
    
      815332
              svaddhnb(svshll8b(left),
    
      407666
                       svmla_x(svptrue_b16(), half, svsublb(right, left), vsxfrac_b));
    
      815332
          return svaddhnt(
    
      407666
              res, svshll8t(left),
    
      407666
              svmla_x(svptrue_b16(), half, svsublt(right, left), vsxfrac_t));
    
      407666
        }
    
      206836
        svuint8_t common_vector_path_r2(
    
            const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, svuint8x2_t topsrc,
    
            svuint8x2_t bottomsrc) const KLEIDICV_STREAMING {
    
      206836
          svuint8_t vsx0_idx = svld1(svptrue_b8(), pcit.idx_ptr_);
    
      206836
          svuint8_t vsx1_idx = svadd_n_u8_x(svptrue_b8(), vsx0_idx, kChannels);
    
      206836
          svuint8_t a = svtbl2_u8(topsrc, vsx0_idx);
    
      206836
          svuint8_t b = svtbl2_u8(topsrc, vsx1_idx);
    
      206836
          svuint8_t c = svtbl2_u8(bottomsrc, vsx0_idx);
    
      206836
          svuint8_t d = svtbl2_u8(bottomsrc, vsx1_idx);
    
      413672
          return interpolate(pcit, yfrac, a, b, c, d);
    
      206836
        }
    
      199202
        svuint8_t vector_path_r2(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
    
                                 const uint8_t *src_top,
    
                                 const uint8_t *src_bottom) const KLEIDICV_STREAMING {
    
          // Load 2*step elements, that's enough for 1/2 < scale < 1.0
    
      199202
          uint64_t src_index = *pcit.src_index_ptr_;
    
      199202
          svuint8x2_t topsrc = load8x2_u8(&src_top[src_index]);
    
      199202
          svuint8x2_t bottomsrc = load8x2_u8(&src_bottom[src_index]);
    
      398404
          return common_vector_path_r2(pcit, yfrac, topsrc, bottomsrc);
    
      199202
        }
    
      7634
        svuint8_t remaining_path_r2(const PrecalcIterator<kRatio> &pcit,
    
                                    uint16_t yfrac, const uint8_t *src_top,
    
                                    const uint8_t *src_bottom) const
    
            KLEIDICV_STREAMING {
    
          // Load 2*step elements, that's enough for 1/2 < scale < 1.0
    
      7634
          uint64_t src_index = *pcit.src_index_ptr_;
    
      15268
          svuint8x2_t topsrc = load8x2_while_u8(&src_top[src_index], src_index,
    
      7634
                                                src_width_ * kChannels);
    
      15268
          svuint8x2_t bottomsrc = load8x2_while_u8(&src_bottom[src_index], src_index,
    
      7634
                                                   src_width_ * kChannels);
    
      15268
          return common_vector_path_r2(pcit, yfrac, topsrc, bottomsrc);
    
      7634
        }
    
      200830
        svuint8_t common_vector_path_r3(
    
            const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, svuint8x3_t topsrc,
    
            svuint8x3_t bottomsrc) const KLEIDICV_STREAMING {
    
      200830
          svuint8_t vsx0_idx = svld1(svptrue_b8(), pcit.idx_ptr_);
    
      200830
          svuint8_t vsx1_idx = svadd_n_u8_x(svptrue_b8(), vsx0_idx, kChannels);
    
      401660
          svuint8_t a =
    
      200830
              svtbl2_u8(svcreate2(svget3(topsrc, 0), svget3(topsrc, 1)), vsx0_idx);
    
      401660
          svuint8_t b =
    
      200830
              svtbl2_u8(svcreate2(svget3(topsrc, 0), svget3(topsrc, 1)), vsx1_idx);
    
      602490
          svuint8_t c = svtbl2_u8(
    
      401660
              svcreate2(svget3(bottomsrc, 0), svget3(bottomsrc, 1)), vsx0_idx);
    
      602490
          svuint8_t d = svtbl2_u8(
    
      401660
              svcreate2(svget3(bottomsrc, 0), svget3(bottomsrc, 1)), vsx1_idx);
    
      200830
          vsx0_idx =
    
      200830
              svsub_n_u8_x(svptrue_b8(), vsx0_idx, static_cast<uint8_t>(2 * kStep_));
    
      200830
          vsx1_idx =
    
      200830
              svsub_n_u8_x(svptrue_b8(), vsx1_idx, static_cast<uint8_t>(2 * kStep_));
    
      200830
          a = svtbx_u8(a, svget3(topsrc, 2), vsx0_idx);
    
      200830
          b = svtbx_u8(b, svget3(topsrc, 2), vsx1_idx);
    
      200830
          c = svtbx_u8(c, svget3(bottomsrc, 2), vsx0_idx);
    
      200830
          d = svtbx_u8(d, svget3(bottomsrc, 2), vsx1_idx);
    
      401660
          return interpolate(pcit, yfrac, a, b, c, d);
    
      200830
        }
    
      196086
        svuint8_t vector_path_r3(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
    
                                 const uint8_t *src_top,
    
                                 const uint8_t *src_bottom) const KLEIDICV_STREAMING {
    
          // Load 3*2*step elements, that's enough for 1/3 < scale < 1.0
    
      196086
          uint64_t src_index = *pcit.src_index_ptr_;
    
      196086
          svuint8x3_t topsrc = load8x3_u8(&src_top[src_index]);
    
      196086
          svuint8x3_t bottomsrc = load8x3_u8(&src_bottom[src_index]);
    
      392172
          return common_vector_path_r3(pcit, yfrac, topsrc, bottomsrc);
    
      196086
        }
    
      4744
        svuint8_t remaining_path_r3(const PrecalcIterator<kRatio> &pcit,
    
                                    uint16_t yfrac, const uint8_t *src_top,
    
                                    const uint8_t *src_bottom) const
    
            KLEIDICV_STREAMING {
    
          // Load 3*step elements, that's enough for 1/3 < scale < 1.0
    
      4744
          uint64_t src_index = *pcit.src_index_ptr_;
    
      9488
          svuint8x3_t topsrc = load8x3_while_u8(&src_top[src_index], src_index,
    
      4744
                                                src_width_ * kChannels);
    
      9488
          svuint8x3_t bottomsrc = load8x3_while_u8(&src_bottom[src_index], src_index,
    
      4744
                                                   src_width_ * kChannels);
    
      9488
          return common_vector_path_r3(pcit, yfrac, topsrc, bottomsrc);
    
      4744
        }
    
      4828
        void process_row(uint64_t dy) const KLEIDICV_STREAMING {
    
      4828
          uint64_t sy_fixp = to_src_y(dy);
    
      4828
          ptrdiff_t sy = static_cast<ptrdiff_t>(sy_fixp >> kFixpBits);
    
      4828
          const uint8_t *src_top = &src_rows_.at(sy)[0];
    
      4828
          const uint8_t *src_bottom = &src_rows_.at(sy + 1)[0];
    
      4828
          uint8_t *dst = &dst_rows_.at(static_cast<ptrdiff_t>(dy))[0];
    
      4828
          uint8_t *dst_end = dst + dst_width_ * kChannels;
    
          // Get the highest 8 bits of the fractional part
    
          // This is a good compromise between accuracy and performance
    
          // Because the result is 8bits, the error only affects the least
    
          // significant 1-2 bits, see the accuracy calculation in kleidicv.h
    
      9656
          uint16_t yfrac =
    
      4828
              static_cast<uint16_t>((sy_fixp - (sy << kFixpBits)) >> (kFixpBits - 8));
    
      4828
          auto pcit = precalc_.begin();
    
        12/12✓ Branch 0 taken 15815 times.
✓ Branch 1 taken 672 times.
✓ Branch 2 taken 33519 times.
✓ Branch 3 taken 1032 times.
✓ Branch 4 taken 50267 times.
✓ Branch 5 taken 964 times.
✓ Branch 6 taken 15414 times.
✓ Branch 7 taken 498 times.
✓ Branch 8 taken 32692 times.
✓ Branch 9 taken 858 times.
✓ Branch 10 taken 49937 times.
✓ Branch 11 taken 804 times.

      202472
          while (pcit.index_ + 1 < precalc_.n_iterations_2x()) {
    
      197644
            svuint8_t res0, res1;
    
            if constexpr (kRatio == 3) {
    
      98043
              res0 = vector_path_r3(pcit, yfrac, src_top, src_bottom);
    
      98043
              ++pcit;
    
      98043
              res1 = vector_path_r3(pcit, yfrac, src_top, src_bottom);
    
      98043
              ++pcit;
    
            } else if constexpr (kRatio == 2) {
    
      99601
              res0 = vector_path_r2(pcit, yfrac, src_top, src_bottom);
    
      99601
              ++pcit;
    
      99601
              res1 = vector_path_r2(pcit, yfrac, src_top, src_bottom);
    
      99601
              ++pcit;
    
            }
    
      #if KLEIDICV_TARGET_SME2
    
      32079
            svst1(svptrue_c8(), dst, svcreate2(res0, res1));
    
      #else
    
      165565
            svst1(svptrue_b8(), dst, res0);
    
      165565
            svst1_vnum(svptrue_b8(), dst, 1, res1);
    
      #endif  // KLEIDICV_TARGET_SME2
    
      197644
            dst += 2 * kStep_;
    
      197644
          }
    
          // similar to above, but only a single vector path and with predicates
    
        12/12✓ Branch 0 taken 1895 times.
✓ Branch 1 taken 672 times.
✓ Branch 2 taken 2635 times.
✓ Branch 3 taken 1032 times.
✓ Branch 4 taken 3104 times.
✓ Branch 5 taken 964 times.
✓ Branch 6 taken 968 times.
✓ Branch 7 taken 498 times.
✓ Branch 8 taken 2062 times.
✓ Branch 9 taken 858 times.
✓ Branch 10 taken 1714 times.
✓ Branch 11 taken 804 times.

      17206
          while (pcit.index_ < precalc_.n_iterations()) {
    
      12378
            svbool_t pgdst = svwhilelt_b8(0L, dst_end - dst);
    
      12378
            svuint8_t res;
    
            if constexpr (kRatio == 2) {
    
      7634
              res = remaining_path_r2(pcit, yfrac, src_top, src_bottom);
    
            } else if constexpr (kRatio == 3) {
    
      4744
              res = remaining_path_r3(pcit, yfrac, src_top, src_bottom);
    
            }
    
      12378
            svst1(pgdst, dst, res);
    
      12378
            ++pcit;
    
      12378
            dst += kStep_;
    
      12378
          }
    
      4828
        }
    
        const Rows<const uint8_t> src_rows_;
    
        const Rows<uint8_t> dst_rows_;
    
        const size_t src_width_;
    
        const size_t src_height_;
    
        const size_t y_begin_;
    
        const size_t y_end_;
    
        const size_t dst_width_;
    
        const size_t dst_height_;
    
        const ptrdiff_t kStep_;
    
        PrecalcIndicesFractions<kRatio, kChannels> precalc_;
    
      };
    
      }  // namespace resize_generic_u8
    
      // ratio: number of vectors to load and resize to 1 vector
    
      // - supported combinations of (ratio, channel): (2, 1), (2, 2), (3, 1), (3,
    
      // 2)
    
      template <ptrdiff_t kRatio, ptrdiff_t kChannels>
    
      457
      kleidicv_error_t kleidicv_resize_generic_stripe_u8_sc(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end,
    
          uint8_t *dst,  // NOLINT
    
          size_t dst_stride, size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
    
      914
        resize_generic_u8::ResizeGenericU8Operation<kRatio, kChannels> operation(
    
      457
            src, src_stride, src_width, src_height, y_begin, y_end, dst, dst_stride,
    
      457
            dst_width, dst_height);
    
      457
        return operation.process_rows();
    
      457
      }
    
      #define KLEIDICV_INSTANTIATE_TEMPLATE_SC(ratio, channels)            \
    
        template kleidicv_error_t                                          \
    
        kleidicv_resize_generic_stripe_u8_sc<ratio, channels>(             \
    
            const uint8_t *src, size_t src_stride, size_t src_width,       \
    
            size_t src_height, size_t y_begin, size_t y_end, uint8_t *dst, \
    
            size_t dst_stride, size_t dst_width, size_t dst_height)        \
    
            KLEIDICV_STREAMING
    
      KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 1L);
    
      KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 2L);
    
      KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 3L);
    
      KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 1L);
    
      KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 2L);
    
      KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 3L);
    
      }  // namespace KLEIDICV_TARGET_NAMESPACE
    
      #endif  // KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H