KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/resize/resize_linear_generic_u8_neon.h
Date:	2026-03-05 15:57:40
	Exec	Total	Coverage
Lines:	478	489	97.8%
Functions:	144	153	94.1%
Branches:	151	274	55.1%
  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <algorithm>
    
      #include <cstddef>
    
      #include <cstdint>
    
      #include <cstdlib>
    
      #include <memory>
    
      #include <utility>
    
      #include <variant>
    
      #include "kleidicv/ctypes.h"
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/utils.h"
    
      namespace kleidicv::neon::resize_linear_generic_u8 {
    
      //------------------------------------------------------
    
      /// Generic resize for ratios 1/3 to 1/1, u8
    
      //------------------------------------------------------
    
      // For the coordinate calculation, fixed-point format is used, for better
    
      // performance. Fixed-point format:
    
      // - lowest 16 bits are the fractional part, that is the kFixpBits constant
    
      // - at interpolation, the high 8 bits are used from the fractional part
    
      //   (this is a good compromise between accuracy and performance: because the
    
      //   result is 8 bits, the error only affects the least significant 1-2 bits, see
    
      //   the accuracy calculation in kleidicv.h)
    
      // - to get the integer part, right shift by 16 bits, or zip/unzip/tbl etc. to
    
      //   get the bytes needed
    
      // - for better accuracy, rounding is needed everywhere, i.e. adding 0.5, which
    
      //   is 1 << 15
    
      // Number of fractional bits in the fixed-point coordinate format.
      static constexpr ptrdiff_t kFixpBits = 16;
    
      // 0.5 in fixed-point representation (1 << 15); added for rounding.
      static constexpr ptrdiff_t kFixpHalf = (1UL << (kFixpBits - 1));
    
      // kVectorLength divided by the size of one u8 element, i.e. the number of u8
      // elements processed per full vector step.
      static constexpr ptrdiff_t kStep = kVectorLength / sizeof(uint8_t);
    
      // Half of kStep; the number of u8 elements processed per half vector step.
      static constexpr ptrdiff_t kHalfStep = kStep / 2;
    
      // Precomputed per-row constants for one full vector (kStep destination
      // elements), filled in by the RowInterpolationConstantsGenerator classes.
      struct FullVectorInterpolationConstants {
    
        // Per-destination-element source index, relative to src_element_index.
        uint8_t idx[kStep];
    
        // Per-destination-element interpolation fraction (upper 8 bits of the
        // 16-bit fixed-point fraction, stored in a 16-bit slot).
        uint16_t xfrac[kStep];
    
        // Source element index that the idx values are relative to.
        ptrdiff_t src_element_index;
    
      };
    
      // Precomputed per-row constants for one half vector (kHalfStep destination
      // elements), filled in by the RowInterpolationConstantsGenerator classes.
      struct HalfVectorInterpolationConstants {
    
        // Per-destination-element source index, relative to src_element_index.
        uint8_t idx[kHalfStep];
    
        // Per-destination-element interpolation fraction (upper 8 bits of the
        // 16-bit fixed-point fraction, stored in a 16-bit slot).
        uint16_t xfrac[kHalfStep];
    
        // Source element index that the idx values are relative to.
        ptrdiff_t src_element_index;
    
        // Destination element index this half vector is written to; stored
        // explicitly because the generators may pull it back near the row end.
        ptrdiff_t dst_element_index;
    
      };
    
      // Iteration counts of the two vector paths used to process one row.
      struct VectorPathNums {
    
        // Number of "2x" iterations; each one covers 2 * kStep destination elements
        // (two FullVectorInterpolationConstants entries).
        size_t two_x;
    
        // Number of half-vector iterations covering the destination elements left
        // over after the 2x iterations.
        size_t half;
    
      313
        // Builds the counts from a {two_x, half} pair.
        explicit VectorPathNums(std::pair<size_t, size_t> sizes)
    
      313
            : two_x{sizes.first}, half{sizes.second} {}
    
      };
    
      /// Divides @p nom by @p denom and rounds the result to the nearest integer
      /// (ties are rounded up), then converts it to @p T.
      ///
      /// The rounding is derived from the remainder instead of computing
      /// `nom + denom / 2`, so the result is correct even when that sum would not
      /// fit into 64 bits.
      ///
      /// @tparam T     Type of the returned quotient (narrowing is the caller's
      ///               responsibility).
      /// @param nom    Numerator.
      /// @param denom  Denominator, must not be zero.
      /// @return       round(nom / denom) converted to T.
      template <typename T = uint64_t>
      static T rounding_div(uint64_t nom, uint64_t denom) {
        const uint64_t quotient = nom / denom;
        const uint64_t remainder = nom % denom;
        // remainder + floor(denom / 2) >= denom  <=>  remainder >= ceil(denom / 2)
        const uint64_t round_up = (remainder >= denom - (denom / 2)) ? 1 : 0;
        return static_cast<T>(quotient + round_up);
      }
    
      // Scale coordinate using this formula, so the center is aligned:
    
      //   source_x = (destination_x + 0.5) / scale - 0.5;
    
      //   plus 1/256/2 for later rounding the fractional part to 8bits
    
      5019
      static inline uint64_t aligned_scale(uint64_t x, uint64_t nom, uint64_t denom) {
    
      5019
        return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) - kFixpHalf +
    
               (1 << (kFixpBits - 9));
    
      }
    
      class RowInterpolationConstants {
    
       public:
    
        // Constructible only through create
    
        RowInterpolationConstants() = delete;
    
      313
        static std::variant<RowInterpolationConstants, kleidicv_error_t> create(
    
            VectorPathNums num_of_vector_paths) {
    
          {
    
      626
            uint8_t *allocation = static_cast<uint8_t *>(malloc(
    
      313
                num_of_vector_paths.two_x * 2 *
    
      313
                    sizeof(FullVectorInterpolationConstants) +
    
      313
                num_of_vector_paths.half * sizeof(HalfVectorInterpolationConstants)));
    
        2/2✓ Branch 0 taken 301 times.
✓ Branch 1 taken 12 times.

      313
            if (!allocation) {
    
      12
              return KLEIDICV_ERROR_ALLOCATION;
    
            }
    
      301
            return RowInterpolationConstants{num_of_vector_paths, allocation};
    
      313
          }
    
          return KLEIDICV_OK;
    
      313
        }
    
      125003
        VectorPathNums num_of_vector_paths() const { return num_of_vector_paths_; }
    
      486760
        FullVectorInterpolationConstants *full_vector_constants_array() const {
    
      486760
          return full_vector_constants_array_;
    
        }
    
      3941
        HalfVectorInterpolationConstants *half_vector_constants_array() const {
    
      3941
          return half_vector_constants_array_;
    
        }
    
       private:
    
      301
        RowInterpolationConstants(VectorPathNums num_of_vector_paths, uint8_t *buffer)
    
      301
            : buffer_{buffer, &std::free},
    
      602
              full_vector_constants_array_{
    
      301
                  reinterpret_cast<FullVectorInterpolationConstants *>(buffer)},
    
      602
              half_vector_constants_array_{
    
                  reinterpret_cast<HalfVectorInterpolationConstants *>(
    
      602
                      full_vector_constants_array_ +
    
      301
                      (num_of_vector_paths.two_x * 2))},
    
      301
              num_of_vector_paths_{num_of_vector_paths} {}
    
        using FreeDeleter = decltype(&std::free);
    
        std::unique_ptr<uint8_t, FreeDeleter> buffer_;
    
        FullVectorInterpolationConstants *const full_vector_constants_array_;
    
        HalfVectorInterpolationConstants *const half_vector_constants_array_;
    
        const VectorPathNums num_of_vector_paths_;
    
      };
    
      template <ptrdiff_t kRatio, ptrdiff_t kChannels>
    
      class RowInterpolationConstantsGeneratorBase {
    
       protected:
    
      313
        RowInterpolationConstantsGeneratorBase(size_t src_width, size_t dst_width)
    
      313
            : src_width_{src_width},
    
      313
              dst_width_{dst_width},
    
      313
              vsidx_tbl_{2, 6, 10, 14, 18, 22, 26, 30},
    
      313
              vsfrac_tbl_{1,  255, 5,  255, 9,  255, 13, 255,
    
      313
                          17, 255, 21, 255, 25, 255, 29, 255} {}
    
      313
        std::pair<size_t, size_t> calculate_num_of_vector_paths() {
    
        8/12✓ Branch 0 taken 30 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 56 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 45 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 30 times.
✓ Branch 7 taken 20 times.
✓ Branch 8 taken 56 times.
✗ Branch 9 not taken.
✓ Branch 10 taken 56 times.
✗ Branch 11 not taken.

      313
          size_t two_x = ((src_width_ * kChannels) >= (sizeof(uint8x16_t) * kRatio))
    
      273
                             ? ((dst_width_ * kChannels) / (2 * kStep))
    
                             : 0;
    
      626
          size_t remaining_dx_after_2x_cycle =
    
      313
              (dst_width_ * kChannels) - (two_x * 2 * kStep);
    
      313
          size_t half = align_up(remaining_dx_after_2x_cycle, kHalfStep) / kHalfStep;
    
      313
          return {two_x, half};
    
      313
        }
    
        // Scale destination x coordinate to source x coordinate, into fixed-point,
    
        // without center correction
    
      3882944
        uint32_t scale_x(uint64_t dx) const {
    
      3882944
          return rounding_div<uint32_t>(((dx * src_width_) << kFixpBits), dst_width_);
    
        }
    
      1787
        uint64_t to_src_x(uint64_t dx) const {
    
      1787
          return aligned_scale(dx, src_width_, dst_width_);
    
        }
    
        const size_t src_width_;
    
        const size_t dst_width_;
    
        const uint8x8_t vsidx_tbl_;
    
        const uint8x16_t vsfrac_tbl_;
    
      };
    
      /// Generates the per-row interpolation constants for channel counts where
      /// kStep is a multiple of kChannels (the 3-channel case has its own
      /// specialization).
      ///
      /// Calling the object allocates a RowInterpolationConstants buffer and fills
      /// in the index/fraction tables for every full-vector and half-vector step of
      /// one destination row.
      template <ptrdiff_t kRatio, ptrdiff_t kChannels>
      class RowInterpolationConstantsGenerator final
          : RowInterpolationConstantsGeneratorBase<kRatio, kChannels> {
       public:
        using Base = RowInterpolationConstantsGeneratorBase<kRatio, kChannels>;

        RowInterpolationConstantsGenerator(size_t src_width, size_t dst_width)
            : Base{src_width, dst_width},
              // These starting values are not aligned to center. The center alignment
              // must be added only once. When added to a center-aligned source_x
              // value, the result will be center-aligned.
              vsx0_0_{Base::scale_x(0), Base::scale_x(1 / kChannels),
                      Base::scale_x(2 / kChannels), Base::scale_x(3 / kChannels)},
              vsx0_1_{Base::scale_x(4 / kChannels), Base::scale_x(5 / kChannels),
                      Base::scale_x(6 / kChannels), Base::scale_x(7 / kChannels)},
              vsx0_2_{Base::scale_x(8 / kChannels), Base::scale_x(9 / kChannels),
                      Base::scale_x(10 / kChannels), Base::scale_x(11 / kChannels)},
              vsx0_3_{Base::scale_x(12 / kChannels), Base::scale_x(13 / kChannels),
                      Base::scale_x(14 / kChannels), Base::scale_x(15 / kChannels)} {}

        /// Allocates and fills the interpolation constants of one row.
        ///
        /// @return The filled RowInterpolationConstants, or the error reported by
        ///         RowInterpolationConstants::create.
        std::variant<RowInterpolationConstants, kleidicv_error_t> operator()() {
          VectorPathNums v{Base::calculate_num_of_vector_paths()};
          auto row_interpolation_constants_variant =
              RowInterpolationConstants::create(v);
          if (std::holds_alternative<kleidicv_error_t>(
                  row_interpolation_constants_variant)) {
            // Creation failed with some error, return with the variant as it is
            return row_interpolation_constants_variant;
          }
          auto &row_interpolation_constants = *std::get_if<RowInterpolationConstants>(
              &row_interpolation_constants_variant);
          const VectorPathNums paths =
              row_interpolation_constants.num_of_vector_paths();

          uint64_t dx = 0;
          uint64_t sx_fixp = 0;

          // Calculate constants for full vectors

          // Maximum source coordinate for vector path 2x. The subtraction saturates
          // at zero, so a source row shorter than the read length yields 0 instead
          // of a wrapped-around huge value.
          const uint64_t max_sx_2x =
              saturating_sub(Base::src_width_ * kChannels,
                             sizeof(uint8x16_t) * kRatio) /
              kChannels;
          // Difference in source x coordinate for one vector path
          const uint64_t sx_fixp_vector_step = rounding_div(
              (Base::src_width_ * kStep / kChannels) << kFixpBits, Base::dst_width_);

          for (size_t i = 0; i < paths.two_x; ++i) {
            // Repeatedly adding sx_fixp_vector_step is faster than scaling dx to sx,
            // but it accumulates fixed-point error; periodic recalibration resets it.
            // The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1 <<
            // 16). Only the upper 8 bits of the 16-bit fractional part are used for
            // interpolation, so once the accumulated error reaches 1 / (1 << 8), it
            // can affect later stages. This corresponds to 512 additions. Since two
            // additions are performed per cycle, we recalibrate every 256 cycles,
            // calculated by this mask.
            constexpr uint64_t kRecalibrateCycleMask = ((1 << 8) - 1);
            if ((i & kRecalibrateCycleMask) == 0) {
              sx_fixp = Base::to_src_x(dx);
            }

            // Pull back sx if it would overrun
            uint64_t sx_candidate = sx_fixp >> kFixpBits;
            uint64_t sx_base = std::min(max_sx_2x, sx_candidate);
            calculate_indices_fractions_base_2x(
                row_interpolation_constants.full_vector_constants_array()[i * 2],
                sx_base, sx_fixp);
            sx_fixp += sx_fixp_vector_step;
            dx += kStep / kChannels;

            // Pull back sx if it would overrun
            sx_candidate = sx_fixp >> kFixpBits;
            sx_base = std::min(max_sx_2x, sx_candidate);
            calculate_indices_fractions_base_2x(
                row_interpolation_constants
                    .full_vector_constants_array()[(i * 2) + 1],
                sx_base, sx_fixp);
            sx_fixp += sx_fixp_vector_step;
            dx += kStep / kChannels;
          }

          // Calculate constants for half vectors

          sx_fixp = Base::to_src_x(dx);
          // Difference in source x coordinate for one destination pixel
          const uint64_t sx_fixp_one_dst_pixel =
              rounding_div(Base::src_width_ << kFixpBits, Base::dst_width_);
          // Maximum source coordinate for half vector path (saturating, see
          // max_sx_2x)
          const uint64_t max_sx_half =
              saturating_sub(Base::src_width_ * kChannels,
                             sizeof(uint8x16_t) * (kRatio - 1)) /
              kChannels;
          // Maximum destination coordinate for half vector path
          // NOTE(review): assumes dst_width_ >= kHalfStep / kChannels; confirm that
          // callers guarantee this, otherwise the subtraction wraps around.
          const uint64_t max_dx_half = Base::dst_width_ - (kHalfStep / kChannels);
          // Difference in source x coordinate for the half vector path
          const uint64_t sx_fixp_half_step =
              rounding_div((Base::src_width_ * kHalfStep / kChannels) << kFixpBits,
                           Base::dst_width_);

          for (size_t i = 0; i < paths.half; ++i) {
            // If (dx + half vector length) would overrun the buffer, pull it back
            uint64_t dx_pulled_back = std::min(dx, max_dx_half);
            // Pull back sx if dx was pulled back
            sx_fixp -= (dx - dx_pulled_back) * sx_fixp_one_dst_pixel;
            dx = dx_pulled_back;

            // If (sx_base + reading length) would overrun the buffer, pull sx back
            // again
            uint64_t sx_candidate = sx_fixp >> kFixpBits;
            uint64_t sx_base = std::min(max_sx_half, sx_candidate);
            calculate_indices_fractions_base_half(
                row_interpolation_constants.half_vector_constants_array()[i], sx_base,
                sx_fixp, dx);
            dx += kHalfStep / kChannels;
            sx_fixp += sx_fixp_half_step;
          }

          return row_interpolation_constants_variant;
        }

       private:
        // Returns a - b, or 0 if b is greater than a (no unsigned wrap-around).
        static constexpr uint64_t saturating_sub(uint64_t a, uint64_t b) {
          return (a > b) ? (a - b) : 0;
        }

        // Fills the constants of one full vector (kStep destination elements).
        //   sx_base: integer source x coordinate the indices are relative to
        //   sx_fixp: fixed-point source x coordinate of the first element
        void calculate_indices_fractions_base_2x(
            FullVectorInterpolationConstants &constants, uint64_t sx_base,
            uint64_t sx_fixp) {
          // Fixed-point offset of the first element from sx_base
          uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp - (sx_base << kFixpBits));
          uint32x4_t vfrac = vdupq_n_u32(xfrac0);

          // Calculate x coordinate delta from sx_base, the integer part of source x
          uint8x16x2_t vsx_delta_lo, vsx_delta_hi;
          vsx_delta_lo.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_0_, vfrac));
          vsx_delta_lo.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_1_, vfrac));
          vsx_delta_hi.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_2_, vfrac));
          vsx_delta_hi.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_3_, vfrac));

          // Gather the integer parts (one byte per element)
          uint8x8_t idx0 = vqtbl2_u8(vsx_delta_lo, Base::vsidx_tbl_);
          uint8x8_t idx1 = vqtbl2_u8(vsx_delta_hi, Base::vsidx_tbl_);
          uint8x16_t vsx0_idx = vcombine_u8(idx0, idx1);
          if constexpr (kChannels > 1) {
            // Convert pixel index to element index: multiply by the channel count
            // and add the channel offset within the pixel
            vsx0_idx = vshlq_n_u8(vsx0_idx, kChannels == 4 ? 2 : 1);
            vsx0_idx =
                vaddq_u8(vsx0_idx, vreinterpretq_u8_u32(vdupq_n_u32(
                                       kChannels == 4 ? 0x03020100U : 0x01000100)));
          }
          vst1q(constants.idx, vsx0_idx);

          // Gather the upper 8 bits of the fractional parts into 16-bit elements
          uint16x8x2_t vsxfrac;
          vsxfrac.val[0] =
              vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_lo, Base::vsfrac_tbl_));
          vsxfrac.val[1] =
              vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_hi, Base::vsfrac_tbl_));
          VecTraits<uint16_t>::store(vsxfrac, constants.xfrac);

          constants.src_element_index = static_cast<ptrdiff_t>(sx_base * kChannels);
        }

        // Fills the constants of one half vector (kHalfStep destination elements).
        //   sx_base: integer source x coordinate the indices are relative to
        //   sx_fixp: fixed-point source x coordinate of the first element
        //   dx:      destination x coordinate of the first element
        void calculate_indices_fractions_base_half(
            HalfVectorInterpolationConstants &constants, uint64_t sx_base,
            uint64_t sx_fixp, uint64_t dx) {
          // Fixed-point offset of the first element from sx_base
          uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp - (sx_base << kFixpBits));
          uint32x4_t vfrac = vdupq_n_u32(xfrac0);

          // Calculate x coordinate delta from sx_base, the integer part of source x
          uint8x16x2_t vsx_delta;
          vsx_delta.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_0_, vfrac));
          vsx_delta.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_1_, vfrac));

          // Gather the integer parts (one byte per element)
          uint8x8_t vsx0_idx = vqtbl2_u8(vsx_delta, Base::vsidx_tbl_);
          if constexpr (kChannels > 1) {
            // Convert pixel index to element index: multiply by the channel count
            // and add the channel offset within the pixel
            vsx0_idx = vshl_n_u8(vsx0_idx, kChannels == 4 ? 2 : 1);
            vsx0_idx = vadd_u8(
                vsx0_idx, vreinterpret_u8_u32(
                              vdup_n_u32(kChannels == 4 ? 0x03020100U : 0x01000100)));
          }
          vst1(constants.idx, vsx0_idx);

          // Gather the upper 8 bits of the fractional parts into 16-bit elements
          uint16x8_t vsxfrac =
              vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta, Base::vsfrac_tbl_));
          VecTraits<uint16_t>::store(vsxfrac, constants.xfrac);

          constants.src_element_index = static_cast<ptrdiff_t>(sx_base * kChannels);
          constants.dst_element_index = static_cast<ptrdiff_t>(dx * kChannels);
        }

        // Non-center-aligned fixed-point source x offsets of the 16 elements of a
        // vector, four per register
        const uint32x4_t vsx0_0_;
        const uint32x4_t vsx0_1_;
        const uint32x4_t vsx0_2_;
        const uint32x4_t vsx0_3_;
      };
    
      template <ptrdiff_t kRatio>
    
      class RowInterpolationConstantsGenerator<kRatio, 3> final
    
          : RowInterpolationConstantsGeneratorBase<kRatio, 3> {
    
       public:
    
        using Base = RowInterpolationConstantsGeneratorBase<kRatio, 3>;
    
      101
        RowInterpolationConstantsGenerator(size_t src_width, size_t dst_width)
    
      101
            : Base{src_width, dst_width},
    
      202
              sx_fixp_one_dst_pixel_{
    
      202
                  rounding_div(src_width << kFixpBits, dst_width)} {}
    
      101
        std::variant<RowInterpolationConstants, kleidicv_error_t> operator()() {
    
      101
          VectorPathNums v{Base::calculate_num_of_vector_paths()};
    
      101
          auto row_interpolation_constants_variant =
    
      101
              RowInterpolationConstants::create(v);
    
        4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.

      101
          if (std::holds_alternative<kleidicv_error_t>(
    
                  row_interpolation_constants_variant)) {
    
            // Creation failed with some error, return with the variant as it is
    
      4
            return row_interpolation_constants_variant;
    
          }
    
      97
          auto &row_interpolation_constants = *std::get_if<RowInterpolationConstants>(
    
              &row_interpolation_constants_variant);
    
      97
          uint64_t dst_element_index = 0;
    
      97
          uint64_t sx_fixp{};
    
          // Calculate constants for full vectors
    
      194
          size_t num_of_full_vector_constants =
    
      97
              row_interpolation_constants.num_of_vector_paths().two_x * 2;
    
        2/4✗ Branch 0 not taken.
✓ Branch 1 taken 43 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.

      97
          if (num_of_full_vector_constants > 0) {
    
      97
            size_t handled_full_vector_paths = 0;
    
        3/4✓ Branch 0 taken 14 times.
✓ Branch 1 taken 29 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.

      97
            if (num_of_full_vector_constants > 3) {
    
      166
              size_t num_of_vector_paths_wout_pullback =
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      83
                  get_num_of_vector_paths_wout_pullback(num_of_full_vector_constants);
    
              // Handle 3 vectors at a time, that way in pixel index is known at
    
              // compile time
    
      166
              size_t vector_path_triplets_wout_pullback =
    
      83
                  num_of_vector_paths_wout_pullback / 3;
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      83
              sx_fixp = Base::to_src_x(0);
    
      83
              unsigned recalibrate_cnt = 0;
    
        4/4✓ Branch 0 taken 39686 times.
✓ Branch 1 taken 29 times.
✓ Branch 2 taken 41138 times.
✓ Branch 3 taken 54 times.

      80907
              for (size_t i = 0; i < vector_path_triplets_wout_pullback; ++i) {
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                const uint32x4x4_t vsx_r = gen_vsx_r();
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                const uint8x16_t vsx_idx_diff_r = gen_vsx_idx_diff_r();
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                const uint32x4x4_t vsx_g = gen_vsx_g();
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                const uint8x16_t vsx_idx_diff_g = gen_vsx_idx_diff_g();
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                const uint32x4x4_t vsx_b = gen_vsx_b();
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                const uint8x16_t vsx_idx_diff_b = gen_vsx_idx_diff_b();
    
                // Difference in source x coordinate for 5 destination pixels
    
      161648
                const uint64_t sx_fixp_five_dst_pixel = rounding_div(
    
      80824
                    (Base::src_width_ * 5) << kFixpBits, Base::dst_width_);
    
                // Difference in source x coordinate for 6 destination pixels
    
      161648
                const uint64_t sx_fixp_six_dst_pixel = rounding_div(
    
      80824
                    (Base::src_width_ * 6) << kFixpBits, Base::dst_width_);
    
                // Repeatedly adding sx_fixp_five_dst_pixel and sx_fixp_six_dst_pixel
    
                // is faster than scaling dx to sx, but it accumulates fixed-point
    
                // error; periodic recalibration resets it. The maximum per-addition
    
                // error of these values is 0.5 / (1 << 16). Only the upper 8
    
                // bits of the 16-bit fractional part are used for interpolation, so
    
                // once the accumulated error reaches 1 / (1 << 8), it can affect
    
                // later stages. This corresponds to 512 additions. Since three
    
                // additions are performed per cycle, we recalibrate every 170 cycles.
    
        4/4✓ Branch 0 taken 228 times.
✓ Branch 1 taken 39458 times.
✓ Branch 2 taken 234 times.
✓ Branch 3 taken 40904 times.

      80824
                if (recalibrate_cnt == 170) {
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      462
                  sx_fixp = Base::to_src_x(dst_element_index / 3);
    
      462
                  recalibrate_cnt = 0;
    
      462
                } else {
    
      80362
                  recalibrate_cnt++;
    
                }
    
      80824
                unsigned in_pixel_index = 0;
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                fill_full_constants_vectorially(
    
      80824
                    row_interpolation_constants
    
      80824
                        .full_vector_constants_array()[handled_full_vector_paths],
    
      80824
                    vsx_r, vsx_idx_diff_r, sx_fixp, in_pixel_index);
    
      80824
                sx_fixp += sx_fixp_five_dst_pixel;
    
      80824
                in_pixel_index = 1;
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                fill_full_constants_vectorially(
    
      80824
                    row_interpolation_constants
    
      80824
                        .full_vector_constants_array()[handled_full_vector_paths + 1],
    
      80824
                    vsx_g, vsx_idx_diff_g, sx_fixp, in_pixel_index);
    
      80824
                sx_fixp += sx_fixp_five_dst_pixel;
    
      80824
                in_pixel_index = 2;
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      80824
                fill_full_constants_vectorially(
    
      80824
                    row_interpolation_constants
    
      80824
                        .full_vector_constants_array()[handled_full_vector_paths + 2],
    
      80824
                    vsx_b, vsx_idx_diff_b, sx_fixp, in_pixel_index);
    
      80824
                sx_fixp += sx_fixp_six_dst_pixel;
    
      80824
                handled_full_vector_paths += 3;
    
      80824
                dst_element_index += kStep * 3;
    
      80824
              }
    
      83
            }
    
        4/4✓ Branch 0 taken 74 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 62 times.
✓ Branch 3 taken 54 times.

      233
            while (handled_full_vector_paths < num_of_full_vector_constants) {
    
      272
              auto &constants =
    
      136
                  row_interpolation_constants
    
      136
                      .full_vector_constants_array()[handled_full_vector_paths];
    
              // Maximum source coordinate for full vector path
    
      272
              const uint64_t max_src_base_index = std::max(
    
      136
                  (Base::src_width_ * kChannels) - (sizeof(uint8x16_t) * kRatio),
    
      136
                  0UL);
    
      136
              uint64_t dx = dst_element_index / kChannels;
    
      136
              unsigned in_pixel_index = dst_element_index % kChannels;
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      136
              sx_fixp = Base::to_src_x(dx);
    
      272
              uint64_t src_element_index =
    
      136
                  ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index;
    
              // Pull back src if it would overrun
    
      272
              uint64_t src_element_base =
    
      136
                  std::min(max_src_base_index, src_element_index);
    
        0/8✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.

      272
              fill_full_constants_scalarly(constants, in_pixel_index,
    
      136
                                           src_element_index, src_element_base,
    
      136
                                           sx_fixp);
    
      136
              handled_full_vector_paths++;
    
      136
              dst_element_index += kStep;
    
      136
            }
    
      97
          }
    
          // Calculate constants for half vectors
    
          // Maximum source coordinate for half vector path
    
      97
          uint64_t half_vector_path_src_read_size =
    
              kChannels == 3 ? sizeof(uint8x16x2_t)
    
                             : (sizeof(uint8x16_t) * (kRatio - 1));
    
      194
          const uint64_t max_src_base_index = std::max(
    
      97
              Base::src_width_ * kChannels - half_vector_path_src_read_size, 0UL);
    
          // Maximum destination coordinate for half vector path
    
      194
          const uint64_t max_dst_index_half =
    
      97
              (Base::dst_width_ * kChannels) - kHalfStep;
    
        4/4✓ Branch 0 taken 87 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 138 times.
✓ Branch 3 taken 54 times.

      322
          for (size_t i = 0;
    
      322
               i < row_interpolation_constants.num_of_vector_paths().half; ++i) {
    
      450
            auto &constants =
    
      225
                row_interpolation_constants.half_vector_constants_array()[i];
    
            // If (dst index + half vector length) would overrun the buffer, pull it
    
            // back
    
      225
            dst_element_index = std::min(dst_element_index, max_dst_index_half);
    
      225
            uint64_t dx = dst_element_index / kChannels;
    
      225
            unsigned in_pixel_index = dst_element_index % kChannels;
    
        0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      225
            sx_fixp = Base::to_src_x(dx);
    
      450
            uint64_t src_element_index =
    
      225
                ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index;
    
            // Pull back src if it would overrun
    
      450
            uint64_t src_element_base =
    
      225
                std::min(max_src_base_index, src_element_index);
    
        0/8✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.

      450
            fill_half_constants_scalarly(constants, dst_element_index, in_pixel_index,
    
      225
                                         src_element_index, src_element_base,
    
      225
                                         sx_fixp);
    
      225
            dst_element_index += kHalfStep;
    
      225
          }
    
      97
          return row_interpolation_constants_variant;
    
      101
        }
    
       private:
    
      83
        size_t get_num_of_vector_paths_wout_pullback(
    
            size_t num_of_full_vector_constants) {
    
      170
          auto vector_needs_pullback = [this](size_t dst_idx) {
    
      87
            unsigned in_pixel_idx = dst_idx % kChannels;
    
      87
            uint64_t dx = dst_idx / kChannels;
    
      87
            uint64_t sx_fixp = Base::to_src_x(dx);
    
      87
            uint64_t src_idx = ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_idx;
    
      174
            return (src_idx + (kStep * kRatio)) > (Base::src_width_ * kChannels);
    
      87
          };
    
        2/4✓ Branch 0 taken 29 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 54 times.
✗ Branch 3 not taken.

      83
          if (num_of_full_vector_constants == 0) {
    
      ✗
            return 0;
    
          }
    
      166
          size_t candidate_last_vector_wout_pullback =
    
      83
              num_of_full_vector_constants - 1;
    
      83
          do {
    
        4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 29 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.

      87
            if (!vector_needs_pullback(candidate_last_vector_wout_pullback * kStep)) {
    
      83
              break;
    
            }
    
      4
            candidate_last_vector_wout_pullback--;
    
        2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.

      4
          } while (candidate_last_vector_wout_pullback > 0);
    
        2/4✓ Branch 0 taken 29 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 54 times.
✗ Branch 3 not taken.

      83
          if (candidate_last_vector_wout_pullback == 0) {
    
      ✗
            if (vector_needs_pullback(candidate_last_vector_wout_pullback * kStep)) {
    
      ✗
              return 0;
    
            }
    
          }
    
      83
          return candidate_last_vector_wout_pullback + 1;
    
      83
        }
    
      80824
        /// Builds the 16 per-lane values used when the first lane of the vector
        /// is the first channel of a pixel: lane i holds Base::scale_x(i / 3).
        uint32x4x4_t gen_vsx_r() {
          return uint32x4x4_t{{
              {Base::scale_x(0), Base::scale_x(0), Base::scale_x(0),
               Base::scale_x(1)},
              {Base::scale_x(1), Base::scale_x(1), Base::scale_x(2),
               Base::scale_x(2)},
              {Base::scale_x(2), Base::scale_x(3), Base::scale_x(3),
               Base::scale_x(3)},
              {Base::scale_x(4), Base::scale_x(4), Base::scale_x(4),
               Base::scale_x(5)},
          }};
        }
    
      80824
        /// Per-lane channel offsets matching gen_vsx_r(): lane i holds i % 3.
        uint8x16_t gen_vsx_idx_diff_r() {
          const uint8x16_t channel_offsets = {0, 1, 2, 0, 1, 2, 0, 1,
                                              2, 0, 1, 2, 0, 1, 2, 0};
          return channel_offsets;
        }
    
      80824
        /// Builds the 16 per-lane values used when the first lane of the vector
        /// is the second channel of a pixel: lane i holds
        /// Base::scale_x((i + 1) / 3).
        uint32x4x4_t gen_vsx_g() {
          return uint32x4x4_t{{
              {Base::scale_x(0), Base::scale_x(0), Base::scale_x(1),
               Base::scale_x(1)},
              {Base::scale_x(1), Base::scale_x(2), Base::scale_x(2),
               Base::scale_x(2)},
              {Base::scale_x(3), Base::scale_x(3), Base::scale_x(3),
               Base::scale_x(4)},
              {Base::scale_x(4), Base::scale_x(4), Base::scale_x(5),
               Base::scale_x(5)},
          }};
        }
    
      80824
        /// Per-lane channel offsets matching gen_vsx_g(): the first (partial)
        /// pixel contributes two lanes (0, 1), then every pixel contributes
        /// 0, 1, 2.
        uint8x16_t gen_vsx_idx_diff_g() {
          const uint8x16_t channel_offsets = {0, 1, 0, 1, 2, 0, 1, 2,
                                              0, 1, 2, 0, 1, 2, 0, 1};
          return channel_offsets;
        }
    
      80824
        /// Builds the 16 per-lane values used when the first lane of the vector
        /// is the third channel of a pixel: lane i holds
        /// Base::scale_x((i + 2) / 3).
        uint32x4x4_t gen_vsx_b() {
          return uint32x4x4_t{{
              {Base::scale_x(0), Base::scale_x(1), Base::scale_x(1),
               Base::scale_x(1)},
              {Base::scale_x(2), Base::scale_x(2), Base::scale_x(2),
               Base::scale_x(3)},
              {Base::scale_x(3), Base::scale_x(3), Base::scale_x(4),
               Base::scale_x(4)},
              {Base::scale_x(4), Base::scale_x(5), Base::scale_x(5),
               Base::scale_x(5)},
          }};
        }
    
      80824
        /// Per-lane channel offsets matching gen_vsx_b(): the first (partial)
        /// pixel contributes one lane (0), then every pixel contributes 0, 1, 2.
        uint8x16_t gen_vsx_idx_diff_b() {
          const uint8x16_t channel_offsets = {0, 0, 1, 2, 0, 1, 2, 0,
                                              1, 2, 0, 1, 2, 0, 1, 2};
          return channel_offsets;
        }
    
      242472
        /// Fills the constants of one full vector (source start index, 16 table
        /// indices and 16 x-fractions) using vector arithmetic.
        ///
        /// @param constants       Output: src_element_index, idx and xfrac are set.
        /// @param vsx             Per-lane fixed-point x offsets (see gen_vsx_*).
        /// @param vsx_idx_diff    Per-lane channel offsets (see gen_vsx_idx_diff_*).
        /// @param sx_fixp         Fixed-point source x of the first lane's pixel.
        /// @param in_pixel_index  Channel index of the first lane inside its pixel.
        void fill_full_constants_vectorially(
            FullVectorInterpolationConstants &constants, uint32x4x4_t vsx,
            uint8x16_t vsx_idx_diff, uint64_t sx_fixp, unsigned in_pixel_index) {
          // The vector's source data starts at the element of the first lane
          const uint64_t first_src_element =
              ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index;
          constants.src_element_index = static_cast<ptrdiff_t>(first_src_element);

          // Add the fractional part of the starting coordinate to every lane
          // offset, giving the lane coordinates relative to the integer start
          const uint32_t start_frac =
              static_cast<uint32_t>(sx_fixp & ((1 << kFixpBits) - 1));
          const uint32x4_t start_frac_v = vdupq_n_u32(start_frac);
          const auto offset_by_frac = [start_frac_v](uint32x4_t lane_offsets) {
            return vreinterpretq_u8_u32(vaddq_u32(lane_offsets, start_frac_v));
          };
          uint8x16x2_t coords_lo, coords_hi;
          coords_lo.val[0] = offset_by_frac(vsx.val[0]);
          coords_lo.val[1] = offset_by_frac(vsx.val[1]);
          coords_hi.val[0] = offset_by_frac(vsx.val[2]);
          coords_hi.val[1] = offset_by_frac(vsx.val[3]);

          // Pick the integer-part byte of each coordinate via Base's index table
          uint8x16_t lane_idx =
              vcombine_u8(vqtbl2_u8(coords_lo, Base::vsidx_tbl_),
                          vqtbl2_u8(coords_hi, Base::vsidx_tbl_));
          // A step of one pixel in x is three elements
          lane_idx = vmulq_u8(lane_idx, vdupq_n_u8(3));
          // Compensate for a first lane that is not the first channel of its
          // pixel (saturating, so lanes of the first pixel stay at 0)
          lane_idx = vqsubq_u8(lane_idx, vdupq_n_u8(in_pixel_index));
          // Finally add the channel offset inside the pixel
          lane_idx = vaddq_u8(lane_idx, vsx_idx_diff);
          vst1q(constants.idx, lane_idx);

          // Pick the fraction bytes of each coordinate via Base's fraction table
          uint16x8x2_t lane_frac;
          lane_frac.val[0] =
              vreinterpretq_u16_u8(vqtbl2q_u8(coords_lo, Base::vsfrac_tbl_));
          lane_frac.val[1] =
              vreinterpretq_u16_u8(vqtbl2q_u8(coords_hi, Base::vsfrac_tbl_));
          VecTraits<uint16_t>::store(lane_frac, constants.xfrac);
        }
    
      136
        /// Fills the constants of one full vector lane by lane.
        ///
        /// The vector's source data is loaded from src_element_base; idx and
        /// xfrac are computed by fill_idx_xfrac() relative to that base.
        void fill_full_constants_scalarly(FullVectorInterpolationConstants &constants,
                                          unsigned in_pixel_index,
                                          uint64_t src_element_index,
                                          uint64_t src_element_base,
                                          uint64_t sx_fixp) {
          fill_idx_xfrac(constants, in_pixel_index, src_element_index,
                         src_element_base, sx_fixp);
          constants.src_element_index = static_cast<ptrdiff_t>(src_element_base);
        }
    
      225
        /// Fills the constants of one half vector lane by lane.
        ///
        /// Besides idx and xfrac (computed by fill_idx_xfrac() relative to
        /// src_element_base), a half vector also records the destination element
        /// index it is stored to.
        void fill_half_constants_scalarly(HalfVectorInterpolationConstants &constants,
                                          uint64_t dst_element_index,
                                          unsigned in_pixel_index,
                                          uint64_t src_element_index,
                                          uint64_t src_element_base,
                                          uint64_t sx_fixp) {
          fill_idx_xfrac(constants, in_pixel_index, src_element_index,
                         src_element_base, sx_fixp);
          constants.src_element_index = static_cast<ptrdiff_t>(src_element_base);
          constants.dst_element_index = static_cast<ptrdiff_t>(dst_element_index);
        }
    
        template <typename VectorConstants>
    
      361
        void fill_idx_xfrac(VectorConstants &constants, unsigned in_pixel_index,
    
                            uint64_t src_element_index, uint64_t src_element_base,
    
                            uint64_t sx_fixp) {
    
          // For indexing inside idx and xfrac arrays of
    
          // the interpolation constants
    
      361
          unsigned j = 0;
    
      361
          uint8_t idx = (src_element_index - src_element_base);
    
      361
          uint16_t xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2);
    
        8/8✓ Branch 0 taken 185 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 163 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 165 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 279 times.
✓ Branch 7 taken 138 times.

      1153
          for (; j < (kChannels - in_pixel_index); ++j) {
    
      792
            constants.idx[j] = idx + j;
    
      792
            constants.xfrac[j] = xfrac;
    
      792
          }
    
      361
          sx_fixp += sx_fixp_one_dst_pixel_;
    
      361
          src_element_index = (sx_fixp >> kFixpBits) * kChannels;
    
      361
          idx = (src_element_index - src_element_base);
    
      361
          xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2);
    
      361
          constexpr size_t idx_frac_elem_num = sizeof(VectorConstants::idx);
    
        8/8✓ Branch 0 taken 370 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 193 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 310 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 300 times.
✓ Branch 7 taken 138 times.

      1534
          while (j < idx_frac_elem_num) {
    
            // k is the index for the elements in one pixel
    
        16/16✓ Branch 0 taken 74 times.
✓ Branch 1 taken 1295 times.
✓ Branch 2 taken 999 times.
✓ Branch 3 taken 370 times.
✓ Branch 4 taken 87 times.
✓ Branch 5 taken 639 times.
✓ Branch 6 taken 533 times.
✓ Branch 7 taken 193 times.
✓ Branch 8 taken 62 times.
✓ Branch 9 taken 1075 times.
✓ Branch 10 taken 827 times.
✓ Branch 11 taken 310 times.
✓ Branch 12 taken 138 times.
✓ Branch 13 taken 987 times.
✓ Branch 14 taken 825 times.
✓ Branch 15 taken 300 times.

      4357
            for (unsigned k = 0; (j < idx_frac_elem_num) && (k < kChannels);
    
      3184
                 ++j, ++k) {
    
      3184
              constants.idx[j] = idx + k;
    
      3184
              constants.xfrac[j] = xfrac;
    
      3184
            }
    
      1173
            sx_fixp += sx_fixp_one_dst_pixel_;
    
      1173
            src_element_index = (sx_fixp >> kFixpBits) * kChannels;
    
      1173
            idx = (src_element_index - src_element_base);
    
      1173
            xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2);
    
          }
    
      361
        }
    
        static constexpr size_t kChannels = 3;
    
        // Difference in source x coordinate for one destination pixel
    
        const size_t sx_fixp_one_dst_pixel_;
    
      };
    
      template <ptrdiff_t kRatio, ptrdiff_t kChannels,
    
                bool kSetRightmostLanes = false>
    
      class ResizeGenericU8Operation final {
    
       public:
    
      301
        ResizeGenericU8Operation(const uint8_t *src, size_t src_stride,
    
                                 size_t src_height, size_t y_begin, size_t y_end,
    
                                 uint8_t *dst, size_t dst_stride, size_t dst_height)
    
      301
            : src_rows_{src, src_stride, kChannels},
    
      301
              dst_rows_{dst, dst_stride, kChannels},
    
      301
              src_height_{src_height},
    
      301
              y_begin_{y_begin},
    
      301
              y_end_{y_end},
    
      301
              dst_height_{dst_height} {}
    
      301
        void process_rows(RowInterpolationConstants &row_interpolation_constants) {
    
        14/16✓ Branch 0 taken 48 times.
✓ Branch 1 taken 448 times.
✓ Branch 2 taken 54 times.
✓ Branch 3 taken 688 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 43 times.
✓ Branch 7 taken 643 times.
✓ Branch 8 taken 48 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 54 times.
✓ Branch 11 taken 572 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 13 times.
✓ Branch 14 taken 46 times.
✓ Branch 15 taken 536 times.

      3533
          for (uint64_t dst_y = y_begin_; dst_y < y_end_; ++dst_y) {
    
      3232
            process_row(dst_y, row_interpolation_constants);
    
      3232
          }
    
      301
        }
    
       private:
    
      3232
        uint64_t to_src_y(uint64_t dy) const {
    
      3232
          return aligned_scale(dy, src_height_, dst_height_);
    
        }
    
      3232
        void process_row(uint64_t dy,
    
                         RowInterpolationConstants &row_interpolation_constants) {
    
      3232
          VectorPathNums num_of_vector_paths =
    
      3232
              row_interpolation_constants.num_of_vector_paths();
    
      6464
          auto *full_array =
    
      3232
              row_interpolation_constants.full_vector_constants_array();
    
      6464
          auto *half_array =
    
      3232
              row_interpolation_constants.half_vector_constants_array();
    
      3232
          uint64_t sy_fixp = to_src_y(dy);
    
      3232
          ptrdiff_t sy = static_cast<ptrdiff_t>(sy_fixp >> kFixpBits);
    
      3232
          const uint8_t *src_top = &src_rows_.at(sy)[0];
    
      3232
          const uint8_t *src_bottom = &src_rows_.at(sy + 1)[0];
    
      3232
          uint8_t *dst = &dst_rows_.at(static_cast<ptrdiff_t>(dy))[0];
    
          // Get the highest 8 bits of the fractional part
    
          // This is a good compromise between accuracy and performance
    
          // Because the result is 8bits, the error only affects the least
    
          // significant 1-2 bits, see the accuracy calculation in kleidicv.h
    
      6464
          uint16_t yfrac =
    
      3232
              static_cast<uint16_t>((sy_fixp - (sy << kFixpBits)) >> (kFixpBits - 8));
    
      3232
          ptrdiff_t dst_element_index = 0;
    
        14/16✓ Branch 0 taken 21912 times.
✓ Branch 1 taken 448 times.
✓ Branch 2 taken 45764 times.
✓ Branch 3 taken 688 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 68368 times.
✓ Branch 7 taken 643 times.
✓ Branch 8 taken 20864 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 44414 times.
✓ Branch 11 taken 572 times.
✓ Branch 12 taken 1068 times.
✓ Branch 13 taken 13 times.
✓ Branch 14 taken 67072 times.
✓ Branch 15 taken 536 times.

      272694
          for (size_t i = 0; i < num_of_vector_paths.two_x; i += 1) {
    
      269462
            uint8x16x2_t res{};
    
      269462
            res.val[0] = vector_path(full_array[i * 2], src_top, src_bottom, yfrac);
    
      269462
            res.val[1] =
    
      269462
                vector_path(full_array[(i * 2) + 1], src_top, src_bottom, yfrac);
    
      269462
            VecTraits<uint8_t>::store(res, &dst[dst_element_index]);
    
      269462
            dst_element_index += kStep * 2;
    
      269462
          }
    
        14/16✓ Branch 0 taken 448 times.
✓ Branch 1 taken 1086 times.
✓ Branch 2 taken 688 times.
✓ Branch 3 taken 1642 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 643 times.
✓ Branch 7 taken 1505 times.
✓ Branch 8 taken 332 times.
✓ Branch 9 taken 880 times.
✓ Branch 10 taken 572 times.
✓ Branch 11 taken 1494 times.
✓ Branch 12 taken 13 times.
✓ Branch 13 taken 39 times.
✓ Branch 14 taken 536 times.
✓ Branch 15 taken 1620 times.

      11498
          for (size_t i = 0; i < num_of_vector_paths.half; i += 1) {
    
      8266
            auto res = vector_path_half(half_array[i], yfrac, src_top, src_bottom);
    
      8266
            vst1(&dst[half_array[i].dst_element_index], res);
    
      8266
          }
    
      3232
        }
    
      8266
        uint8x8_t vector_path_half(const HalfVectorInterpolationConstants &constants,
    
                                   uint16_t yfrac, const uint8_t *src_top,
    
                                   const uint8_t *src_bottom) const {
    
      8266
          uint8x8_t vsx0_idx = vld1_u8(constants.idx);
    
      8266
          uint8x8_t vsx1_idx = vadd_u8(vsx0_idx, vdup_n_u8(kChannels));
    
      8266
          uint16x8_t vsxfrac;
    
      8266
          VecTraits<uint16_t>::load(constants.xfrac, vsxfrac);
    
      8266
          ptrdiff_t src_element_index = constants.src_element_index;
    
          using SrcVecType = std::conditional_t<kRatio == 2 && kChannels != 3,
    
                                                uint8x16_t, uint8x16x2_t>;
    
      8266
          SrcVecType topsrc, bottomsrc;
    
      8266
          VecTraits<uint8_t>::load(&src_top[src_element_index], topsrc);
    
      8266
          VecTraits<uint8_t>::load(&src_bottom[src_element_index], bottomsrc);
    
      8266
          uint8x8_t a, b, c, d;
    
          if constexpr (kRatio == 2 && kChannels != 3) {
    
      2728
            a = vqtbl1_u8(topsrc, vsx0_idx);
    
      2728
            b = vqtbl1_u8(topsrc, vsx1_idx);
    
      2728
            c = vqtbl1_u8(bottomsrc, vsx0_idx);
    
      2728
            d = vqtbl1_u8(bottomsrc, vsx1_idx);
    
          } else if constexpr (kRatio == 3 || kChannels == 3) {
    
      5538
            a = vqtbl2_u8(topsrc, vsx0_idx);
    
      5538
            b = vqtbl2_u8(topsrc, vsx1_idx);
    
      5538
            c = vqtbl2_u8(bottomsrc, vsx0_idx);
    
      5538
            d = vqtbl2_u8(bottomsrc, vsx1_idx);
    
          }
    
      16532
          uint8x8_t left =
    
      8266
              vraddhn_u16(vshll_n_u8(a, 8), vmulq_n_u16(vsubl_u8(c, a), yfrac));
    
      16532
          uint8x8_t right =
    
      8266
              vraddhn_u16(vshll_n_u8(b, 8), vmulq_n_u16(vsubl_u8(d, b), yfrac));
    
      16532
          uint8x8_t res = vraddhn_u16(vshll_n_u8(left, 8),
    
      8266
                                      vmulq_u16(vsubl_u8(right, left), vsxfrac));
    
      16532
          return res;
    
      8266
        }
    
      538924
        uint8x16_t vector_path(const FullVectorInterpolationConstants &constants,
    
                               const uint8_t *src_top, const uint8_t *src_bottom,
    
                               uint16_t yfrac) const {
    
      538924
          uint8x16_t vsx0_idx = vld1q(constants.idx);
    
      538924
          uint8x16_t vsx1_idx = vaddq_u8(vsx0_idx, vdupq_n_u8(kChannels));
    
      538924
          uint16x8x2_t vsxfrac2;
    
      538924
          VecTraits<uint16_t>::load(constants.xfrac, vsxfrac2);
    
      538924
          ptrdiff_t src_element_index = constants.src_element_index;
    
          using SrcVecType =
    
              std::conditional_t<kRatio == 2, uint8x16x2_t, uint8x16x3_t>;
    
      538924
          SrcVecType topsrc, bottomsrc;
    
      538924
          VecTraits<uint8_t>::load(&src_top[src_element_index], topsrc);
    
      538924
          VecTraits<uint8_t>::load(&src_bottom[src_element_index], bottomsrc);
    
      538924
          uint8x16_t a, b, c, d;
    
          if constexpr (kRatio == 2) {
    
      272088
            a = vqtbl2q_u8(topsrc, vsx0_idx);
    
      272088
            b = vqtbl2q_u8(topsrc, vsx1_idx);
    
      272088
            c = vqtbl2q_u8(bottomsrc, vsx0_idx);
    
      272088
            d = vqtbl2q_u8(bottomsrc, vsx1_idx);
    
            if constexpr (kSetRightmostLanes) {
    
              // table lookup would overindex topsrc and bottomsrc
    
      ✗
              ptrdiff_t last_but_one_right_elem_idx =
    
      ✗
                  src_element_index + constants.idx[14] + kChannels;
    
      ✗
              ptrdiff_t last_right_elem_idx =
    
      ✗
                  src_element_index + constants.idx[15] + kChannels;
    
      ✗
              b = vsetq_lane_u8(src_top[last_but_one_right_elem_idx], b, 14);
    
      ✗
              b = vsetq_lane_u8(src_top[last_right_elem_idx], b, 15);
    
      ✗
              d = vsetq_lane_u8(src_bottom[last_but_one_right_elem_idx], d, 14);
    
      ✗
              d = vsetq_lane_u8(src_bottom[last_right_elem_idx], d, 15);
    
            }
    
          } else if constexpr (kRatio == 3) {
    
      266836
            a = vqtbl3q_u8(topsrc, vsx0_idx);
    
      266836
            b = vqtbl3q_u8(topsrc, vsx1_idx);
    
      266836
            c = vqtbl3q_u8(bottomsrc, vsx0_idx);
    
      266836
            d = vqtbl3q_u8(bottomsrc, vsx1_idx);
    
            // table lookup would overindex topsrc and bottomsrc
    
            if constexpr (kSetRightmostLanes) {
    
      4272
              ptrdiff_t last_right_elem_idx =
    
      2136
                  src_element_index + constants.idx[15] + kChannels;
    
      2136
              b = vsetq_lane_u8(src_top[last_right_elem_idx], b, 15);
    
      2136
              d = vsetq_lane_u8(src_bottom[last_right_elem_idx], d, 15);
    
      2136
            }
    
          }
    
      538924
          uint8x8_t left_lo = lerp_low_half(a, c, yfrac);
    
      538924
          uint8x8_t left_hi = lerp_high_half(a, c, yfrac);
    
      538924
          uint8x8_t right_lo = lerp_low_half(b, d, yfrac);
    
      538924
          uint8x8_t right_hi = lerp_high_half(b, d, yfrac);
    
      538924
          uint8x8_t res_lo = lerp_full(left_lo, right_lo, vsxfrac2.val[0]);
    
      538924
          uint8x8_t res_hi = lerp_full(left_hi, right_hi, vsxfrac2.val[1]);
    
      1077848
          return vcombine_u8(res_lo, res_hi);
    
      538924
        }
    
      1077848
        static uint8x8_t lerp_low_half(uint8x16_t a, uint8x16_t b, uint16_t w) {
    
      1077848
          return vraddhn_u16(
    
      1077848
              vshll_n_u8(vget_low_u8(a), 8),
    
      1077848
              vmulq_n_u16(vsubl_u8(vget_low_u8(b), vget_low_u8(a)), w));
    
        }
    
      1077848
        static uint8x8_t lerp_high_half(uint8x16_t a, uint8x16_t b, uint16_t w) {
    
      2155696
          return vraddhn_u16(vshll_high_n_u8(a, 8),
    
      1077848
                             vmulq_n_u16(vsubl_high_u8(b, a), w));
    
        }
    
      1077848
        static uint8x8_t lerp_full(uint8x8_t a, uint8x8_t b, uint16x8_t w) {
    
      1077848
          return vraddhn_u16(vshll_n_u8(a, 8), vmulq_u16(vsubl_u8(b, a), w));
    
        }
    
        const Rows<const uint8_t> src_rows_;
    
        const Rows<uint8_t> dst_rows_;
    
        const size_t src_height_;
    
        const size_t y_begin_;
    
        const size_t y_end_;
    
        const size_t dst_height_;
    
      };
    
      }  // namespace kleidicv::neon::resize_linear_generic_u8