KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/transform/remap_s16point5_neon.cpp
Date:	2025-09-25 14:13:34
	Exec	Total	Coverage
Lines:	790	790	100.0%
Functions:	67	67	100.0%
Branches:	84	84	100.0%
  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <cassert>
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/transform/remap.h"
    
      namespace kleidicv::neon {
    
      // Primary template for the replicated-border remap operation. It is only
      // declared here; the explicit specialisations for uint8_t and uint16_t
      // provide the implementations.
      template <typename ScalarType>
    
      class RemapS16Point5Replicate;
    
      template <>
    
      class RemapS16Point5Replicate<uint8_t> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
        using MapVectorType = typename MapVecTraits::VectorType;
    
        using MapVector2Type = typename MapVecTraits::Vector2Type;
    
        using FracVecTraits = neon::VecTraits<uint16_t>;
    
        using FracVectorType = typename FracVecTraits::VectorType;
    
      67
        RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
    
                                size_t src_height)
    
      67
            : src_rows_{src_rows},
    
      67
              v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
    
      67
              v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
    
      67
              v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
    
      79
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2536
          auto vector_path = [&](size_t step) {
    
      2457
            MapVector2Type xy = vld2q_s16(&mapxy[0]);
    
      2457
            FracVectorType frac = vld1q_u16(&mapfrac[0]);
    
      4914
            uint16x8_t xfrac =
    
      4914
                vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
    
                          // extract xfrac = frac[0:4]
    
      2457
                          vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
    
      4914
            uint16x8_t yfrac =
    
      4914
                vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
    
                          // extract yfrac = frac[5:9]
    
      4914
                          vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
    
      2457
                                    vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
    
      2457
            uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
    
      2457
            uint16x8_t nyfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
    
            // Clamp coordinates to within the dimensions of the source image
    
      4914
            uint16x8_t x0 = vreinterpretq_u16_s16(
    
      2457
                vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
    
      4914
            uint16x8_t y0 = vreinterpretq_u16_s16(
    
      2457
                vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
    
            // x1 = x0 + 1, except if it's already xmax
    
      2457
            uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_));
    
      2457
            uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_));
    
      4914
            uint16x4_t dst_low = load_and_interpolate(
    
      2457
                vmovl_u16(vget_low_u16(x0)), vget_low_u16(y0),
    
      2457
                vmovl_u16(vget_low_u16(x1)), vget_low_u16(y1), vget_low_u16(xfrac),
    
      2457
                vget_low_u16(yfrac), vget_low_u16(nxfrac), vget_low_u16(nyfrac));
    
      4914
            uint16x4_t dst_high = load_and_interpolate(
    
      2457
                vmovl_high_u16(x0), vget_high_u16(y0), vmovl_high_u16(x1),
    
      2457
                vget_high_u16(y1), vget_high_u16(xfrac), vget_high_u16(yfrac),
    
      2457
                vget_high_u16(nxfrac), vget_high_u16(nyfrac));
    
      2457
            vst1_u8(&dst[0], vuzp1_u8(dst_low, dst_high));
    
      2457
            mapxy += ptrdiff_t(step);
    
      2457
            mapfrac += ptrdiff_t(step);
    
      2457
            dst += ptrdiff_t(step);
    
      2457
          };
    
      79
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      79
          loop.unroll_once(vector_path);
    
      158
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      79
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      79
          mapxy -= back_step;
    
      79
          mapfrac -= back_step;
    
      79
          dst -= back_step;
    
      116
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      79
        }
    
       private:
    
      4914
        uint16x4_t load_and_interpolate(uint32x4_t x0, uint16x4_t y0, uint32x4_t x1,
    
                                        uint16x4_t y1, uint16x4_t xfrac,
    
                                        uint16x4_t yfrac, uint16x4_t nxfrac,
    
                                        uint16x4_t nyfrac) {
    
          // Calculate offsets from coordinates (y * stride + x)
    
          // a: top left, b: top right, c: bottom left, d: bottom right
    
      4914
          uint32x4_t offset = vmlal_u16(x0, y0, v_src_stride_);
    
      9828
          uint64_t acc =
    
      9828
              static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
    
      9828
              (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
    
      9828
              (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
    
      4914
              (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
    
      4914
          uint16x4_t a = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
    
      4914
          offset = vmlal_u16(x1, y0, v_src_stride_);
    
      14742
          acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
    
      9828
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
    
      9828
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
    
      4914
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
    
      4914
          uint16x4_t b = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
    
      4914
          uint16x4_t line0 = vmla_u16(vmul_u16(xfrac, b), nxfrac, a);
    
      4914
          offset = vmlal_u16(x0, y1, v_src_stride_);
    
      14742
          acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
    
      9828
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
    
      9828
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
    
      4914
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
    
      4914
          uint16x4_t c = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
    
      9828
          uint32x4_t line0_lerpd = vmlal_u16(
    
      4914
              vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, nyfrac);
    
      4914
          offset = vmlal_u16(x1, y1, v_src_stride_);
    
      14742
          acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
    
      9828
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
    
      9828
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
    
      4914
                (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
    
      4914
          uint16x4_t d = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
    
      4914
          uint16x4_t line1 = vmla_u16(vmul_u16(xfrac, d), nxfrac, c);
    
      9828
          return vshrn_n_u32(vmlal_u16(line0_lerpd, line1, yfrac),
    
                             2 * REMAP16POINT5_FRAC_BITS);
    
      4914
        }
    
        Rows<const ScalarType> src_rows_;
    
        uint16x4_t v_src_stride_;
    
        int16x8_t v_xmax_;
    
        int16x8_t v_ymax_;
    
      };  // end of class RemapS16Point5Replicate<uint8_t>
    
      // Common interpolation function used by all RemapS16Point5 operations except
    
      // 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>)
    
      // because that processes one half vector in one step
    
      46065
      static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
    
                                    uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac,
    
                                    uint16x8_t nxfrac, uint16x8_t nyfrac) {
    
      230325
        auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right,
    
                                         uint16x4_t frac,
    
                                         uint16x4_t nfrac) -> uint32x4_t {
    
      184260
          return vmlal_u16(vmull_u16(nfrac, left), frac, right);
    
        };
    
      138195
        auto interpolate_horizontal_low = [interpolate_horizontal](
    
                                              uint16x8_t left, uint16x8_t right,
    
                                              uint16x8_t frac,
    
                                              uint16x8_t nfrac) -> uint32x4_t {
    
      184260
          return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
    
      92130
                                        vget_low_u16(frac), vget_low_u16(nfrac));
    
        };
    
      138195
        auto interpolate_horizontal_high = [interpolate_horizontal](
    
                                               uint16x8_t left, uint16x8_t right,
    
                                               uint16x8_t frac,
    
                                               uint16x8_t nfrac) -> uint32x4_t {
    
      184260
          return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
    
      92130
                                        vget_high_u16(frac), vget_high_u16(nfrac));
    
        };
    
        // Offset pixel values by 0.5 before rounding down.
    
      46065
        const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
    
      138195
        auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
    
                                        uint32x4_t nfrac) -> uint32x4_t {
    
      92130
          uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
    
      184260
          return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
    
      92130
        };
    
      46065
        uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac);
    
      46065
        uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac);
    
      46065
        uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac);
    
      46065
        uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac);
    
      92130
        uint32x4_t lo =
    
      92130
            interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)),
    
      46065
                                 vmovl_u16(vget_low_u16(nyfrac)));
    
      92130
        uint32x4_t hi = interpolate_vertical(
    
      46065
            line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac));
    
        // Discard upper 16 bits of each element (low the precision back to original
    
        // 16 bits)
    
      92130
        uint16x8_t result =
    
      46065
            vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
    
      92130
        return result;
    
      46065
      }
    
      template <>
    
      class RemapS16Point5Replicate<uint16_t> {
    
       public:
    
        using ScalarType = uint16_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
      67
        RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
    
                                size_t src_height)
    
      67
            : src_rows_{src_rows},
    
      134
              v_src_element_stride_{vdupq_n_u16(
    
      67
                  static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
    
      67
              v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
    
      67
              v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))},
    
      67
              xfrac_{vdupq_n_u16(0)},
    
      67
              yfrac_{vdupq_n_u16(0)},
    
      67
              nxfrac_{vdupq_n_u16(0)},
    
      67
              nyfrac_{vdupq_n_u16(0)},
    
      67
              x0_{vdupq_n_s16(0)},
    
      67
              x1_{vdupq_n_s16(0)},
    
      67
              y0_{vdupq_n_s16(0)},
    
      67
              y1_{vdupq_n_s16(0)} {}
    
      79
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2476
          auto vector_path = [&](size_t step) {
    
      2397
            prepare_maps(mapxy, mapfrac);
    
      2397
            transform_pixels(dst);
    
      2397
            mapxy += ptrdiff_t(step);
    
      2397
            mapfrac += ptrdiff_t(step);
    
      2397
            dst += ptrdiff_t(step);
    
      2397
          };
    
      79
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      79
          loop.unroll_once(vector_path);
    
      158
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      79
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      79
          mapxy -= back_step;
    
      79
          mapfrac -= back_step;
    
      79
          dst -= back_step;
    
      116
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      79
        }
    
      2397
        void prepare_maps(Columns<const int16_t> mapxy,
    
                          Columns<const uint16_t> mapfrac) {
    
      2397
          int16x8x2_t xy = vld2q_s16(&mapxy[0]);
    
      2397
          uint16x8_t frac = vld1q_u16(&mapfrac[0]);
    
      2397
          uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
    
      2397
          uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
    
      4794
          xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
    
      2397
                             vandq_u16(frac, frac_mask));
    
      2397
          yfrac_ = vbslq_u16(
    
      2397
              vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
    
      2397
              vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask));
    
      2397
          nxfrac_ = vsubq_u16(frac_max, xfrac_);
    
      2397
          nyfrac_ = vsubq_u16(frac_max, yfrac_);
    
          // Clamp coordinates to within the dimensions of the source image
    
      2397
          x0_ = vreinterpretq_u16_s16(
    
      2397
              vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
    
      2397
          y0_ = vreinterpretq_u16_s16(
    
      2397
              vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
    
          // x1 = x0 + 1, except if it's already xmax
    
      2397
          x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_));
    
      2397
          y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_));
    
      2397
        }
    
      2397
        void transform_pixels(Columns<uint16_t> dst) {
    
      2397
          uint16x8_t a = load_pixels(x0_, y0_);
    
      2397
          uint16x8_t b = load_pixels(x1_, y0_);
    
      2397
          uint16x8_t c = load_pixels(x0_, y1_);
    
      2397
          uint16x8_t d = load_pixels(x1_, y1_);
    
      4794
          uint16x8_t result =
    
      2397
              interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
    
      2397
          vst1q_u16(&dst[0], result);
    
      2397
        }
    
      9588
        uint16x8_t load_pixels(int16x8_t x, int16x8_t y) {
    
          // Clamp coordinates to within the dimensions of the source image
    
      19176
          uint16x8_t x_clamped =
    
      9588
              vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_);
    
      19176
          uint16x8_t y_clamped =
    
      9588
              vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_);
    
          // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
    
      19176
          uint32x4_t indices_low =
    
      19176
              vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped),
    
      9588
                        vget_low_u16(v_src_element_stride_));
    
      19176
          uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped),
    
      9588
                                                   y_clamped, v_src_element_stride_);
    
          // Read pixels from source
    
      86292
          uint16x8_t pixels = {
    
      9588
              src_rows_[vgetq_lane_u32(indices_low, 0)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_low, 1)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_low, 2)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_low, 3)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_high, 0)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_high, 1)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_high, 2)],
    
      9588
              src_rows_[vgetq_lane_u32(indices_high, 3)],
    
          };
    
      19176
          return pixels;
    
      9588
        }
    
       private:
    
        Rows<const ScalarType> src_rows_;
    
        uint16x8_t v_src_element_stride_;
    
        int16x8_t v_xmax_;
    
        int16x8_t v_ymax_;
    
        uint16x8_t xfrac_;
    
        uint16x8_t yfrac_;
    
        uint16x8_t nxfrac_;
    
        uint16x8_t nyfrac_;
    
        int16x8_t x0_;
    
        int16x8_t x1_;
    
        int16x8_t y0_;
    
        int16x8_t y1_;
    
      };  // end of class RemapS16Point5Replicate<uint16_t>
    
      // Primary template for the constant-border remap operation. It is only
      // declared here; the explicit specialisations for uint8_t and uint16_t
      // provide the implementations.
      template <typename ScalarType>
    
      class RemapS16Point5ConstantBorder;
    
      template <>
    
      class RemapS16Point5ConstantBorder<uint8_t> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
      66
        RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
    
                                     size_t src_width, size_t src_height,
    
                                     const ScalarType *border_value)
    
      66
            : src_rows_{src_rows},
    
      66
              v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
    
      66
              v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
    
      66
              v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
    
      66
              v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {}
    
      78
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2534
          auto vector_path = [&](size_t step) {
    
      2456
            int16x8x2_t xy = vld2q_s16(&mapxy[0]);
    
      2456
            uint16x8_t frac = vld1q_u16(&mapfrac[0]);
    
      2456
            uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
    
      2456
            uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
    
      2456
            uint16x8_t xfrac = vandq_u16(frac, frac_mask);
    
      4912
            uint16x8_t yfrac =
    
      2456
                vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
    
      2456
            uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac);
    
      2456
            uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac);
    
      2456
            uint16x8_t one = vdupq_n_u16(1);
    
      2456
            uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]);
    
      2456
            uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]);
    
      2456
            uint16x8_t x1 = vaddq_u16(x0, one);
    
      2456
            uint16x8_t y1 = vaddq_u16(y0, one);
    
      4912
            uint16x8_t a = load_pixels_or_constant_border(
    
      2456
                src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0);
    
      4912
            uint16x8_t b = load_pixels_or_constant_border(
    
      2456
                src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0);
    
      4912
            uint16x8_t c = load_pixels_or_constant_border(
    
      2456
                src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1);
    
      4912
            uint16x8_t d = load_pixels_or_constant_border(
    
      2456
                src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1);
    
      2456
            uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac);
    
      2456
            vst1_u8(&dst[0], vqmovn_u16(result));
    
      2456
            mapxy += ptrdiff_t(step);
    
      2456
            mapfrac += ptrdiff_t(step);
    
      2456
            dst += ptrdiff_t(step);
    
      2456
          };
    
      78
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      78
          loop.unroll_once(vector_path);
    
      156
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      78
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      78
          mapxy -= back_step;
    
      78
          mapfrac -= back_step;
    
      78
          dst -= back_step;
    
      115
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      78
        }
    
       private:
    
      9824
        uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_,
    
                                                  uint16x8_t v_src_element_stride_,
    
                                                  uint16x8_t v_width_,
    
                                                  uint16x8_t v_height_,
    
                                                  uint16x8_t v_border_, uint16x8_t x,
    
                                                  uint16x8_t y) {
    
          // Find whether coordinates are within the image dimensions.
    
          // Negative coordinates are interpreted as large values due to the s16->u16
    
          // reinterpretation.
    
      19648
          uint16x8_t in_range =
    
      19648
              vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
    
      9824
                        vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
    
          // Zero out-of-range coordinates.
    
      9824
          x = vandq_u16(in_range, x);
    
      9824
          y = vandq_u16(in_range, y);
    
          // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
    
      19648
          uint32x4_t indices_low =
    
      19648
              vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
    
      9824
                        vget_low_u16(v_src_element_stride_));
    
      19648
          uint32x4_t indices_high =
    
      9824
              vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
    
          // Read pixels from source
    
      88416
          uint8x8_t pixels = {
    
      9824
              src_rows_[vgetq_lane_u32(indices_low, 0)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_low, 1)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_low, 2)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_low, 3)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_high, 0)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_high, 1)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_high, 2)],
    
      9824
              src_rows_[vgetq_lane_u32(indices_high, 3)],
    
          };
    
          // Select between source pixels and border colour
    
      19648
          return vbslq_u16(in_range, vmovl_u8(pixels), v_border_);
    
      9824
        }
    
        Rows<const ScalarType> src_rows_;
    
        uint16x8_t v_src_stride_;
    
        uint16x8_t v_width_;
    
        uint16x8_t v_height_;
    
        uint16x8_t v_border_;
    
      };  // end of class RemapS16Point5ConstantBorder<uint8_t>
    
      template <>
    
      class RemapS16Point5ConstantBorder<uint16_t> {
    
       public:
    
        using ScalarType = uint16_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
      66
        RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
    
                                     size_t src_width, size_t src_height,
    
                                     const ScalarType *border_value)
    
      66
            : src_rows_{src_rows},
    
      132
              v_src_element_stride_{vdupq_n_u16(
    
      66
                  static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
    
      66
              v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
    
      66
              v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
    
      66
              v_border_{vdupq_n_u16(*border_value)},
    
      66
              xfrac_{vdupq_n_u16(0)},
    
      66
              yfrac_{vdupq_n_u16(0)},
    
      66
              nxfrac_{vdupq_n_u16(0)},
    
      66
              nyfrac_{vdupq_n_u16(0)},
    
      66
              x0_{vdupq_n_s16(0)},
    
      66
              x1_{vdupq_n_s16(0)},
    
      66
              y0_{vdupq_n_s16(0)},
    
      66
              y1_{vdupq_n_s16(0)} {}
    
      2396
        void prepare_maps(Columns<const int16_t> mapxy,
    
                          Columns<const uint16_t> mapfrac) {
    
      2396
          int16x8x2_t xy = vld2q_s16(&mapxy[0]);
    
      2396
          uint16x8_t frac = vld1q_u16(&mapfrac[0]);
    
      2396
          uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
    
      2396
          uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
    
      2396
          xfrac_ = vandq_u16(frac, frac_mask);
    
      2396
          yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
    
      2396
          nxfrac_ = vsubq_u16(frac_max, xfrac_);
    
      2396
          nyfrac_ = vsubq_u16(frac_max, yfrac_);
    
      2396
          uint16x8_t one = vdupq_n_u16(1);
    
      2396
          x0_ = xy.val[0];
    
      2396
          y0_ = xy.val[1];
    
      2396
          x1_ = vaddq_u16(x0_, one);
    
      2396
          y1_ = vaddq_u16(y0_, one);
    
      2396
        }
    
      78
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2474
          auto vector_path = [&](size_t step) {
    
      2396
            prepare_maps(mapxy, mapfrac);
    
      2396
            transform_pixels(dst);
    
      2396
            mapxy += ptrdiff_t(step);
    
      2396
            mapfrac += ptrdiff_t(step);
    
      2396
            dst += ptrdiff_t(step);
    
      2396
          };
    
      78
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      78
          loop.unroll_once(vector_path);
    
      156
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      78
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      78
          mapxy -= back_step;
    
      78
          mapfrac -= back_step;
    
      78
          dst -= back_step;
    
      115
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      78
        }
    
      2396
        void transform_pixels(Columns<uint16_t> dst) {
    
      2396
          uint16x8_t a = load_pixels(x0_, y0_);
    
      2396
          uint16x8_t b = load_pixels(x1_, y0_);
    
      2396
          uint16x8_t c = load_pixels(x0_, y1_);
    
      2396
          uint16x8_t d = load_pixels(x1_, y1_);
    
      4792
          uint16x8_t result =
    
      2396
              interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
    
      2396
          vst1q_u16(&dst[0], result);
    
      2396
        }
    
      9584
        uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) {
    
          // Find whether coordinates are within the image dimensions.
    
          // Negative coordinates are interpreted as large values due to the s16->u16
    
          // reinterpretation.
    
      19168
          uint16x8_t in_range =
    
      19168
              vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
    
      9584
                        vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
    
          // Zero out-of-range coordinates.
    
      9584
          x = vandq_u16(in_range, x);
    
      9584
          y = vandq_u16(in_range, y);
    
          // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
    
      19168
          uint32x4_t indices_low =
    
      19168
              vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
    
      9584
                        vget_low_u16(v_src_element_stride_));
    
      19168
          uint32x4_t indices_high =
    
      9584
              vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
    
          // Read pixels from source
    
      86256
          uint16x8_t pixels = {
    
      9584
              src_rows_[vgetq_lane_u32(indices_low, 0)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_low, 1)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_low, 2)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_low, 3)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_high, 0)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_high, 1)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_high, 2)],
    
      9584
              src_rows_[vgetq_lane_u32(indices_high, 3)],
    
          };
    
          // Select between source pixels and border colour
    
      19168
          return vbslq_u16(in_range, pixels, v_border_);
    
      9584
        }
    
       private:
    
        Rows<const ScalarType> src_rows_;
    
        uint16x8_t v_src_element_stride_;
    
        uint16x8_t v_width_;
    
        uint16x8_t v_height_;
    
        uint16x8_t v_border_;
    
        uint16x8_t xfrac_;
    
        uint16x8_t yfrac_;
    
        uint16x8_t nxfrac_;
    
        uint16x8_t nyfrac_;
    
        int16x8_t x0_;
    
        int16x8_t x1_;
    
        int16x8_t y0_;
    
        int16x8_t y1_;
    
      };  // end of class RemapS16Point5ConstantBorder<uint16_t>
    
      9704
      // Loads eight interleaved (x, y) coordinate pairs and their packed
      // fractions.
      //
      // x, y:   receive the integer coordinates as the unsigned reinterpretation
      //         of the signed map values (negative values become large ones).
      // xfrac:  receives the low fraction bits of each `mapfrac` element.
      // yfrac:  receives the fraction bits located REMAP16POINT5_FRAC_BITS higher.
      inline void get_coordinates(Columns<const int16_t> mapxy,
                                  Columns<const uint16_t> mapfrac, uint16x8_t &x,
                                  uint16x8_t &y, uint16x8_t &xfrac,
                                  uint16x8_t &yfrac) {
        const uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);

        // De-interleave the coordinates: val[0] holds x, val[1] holds y.
        const int16x8x2_t xy = vld2q_s16(&mapxy[0]);
        x = vreinterpretq_u16_s16(xy.val[0]);
        y = vreinterpretq_u16_s16(xy.val[1]);

        const uint16x8_t frac = vld1q_u16(&mapfrac[0]);
        xfrac = vandq_u16(frac, frac_mask);
        yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
      }
    
      // Computes, for four coordinates, the offsets of the four neighbouring source
    
      // pixels of a 4-channel image. Each offset is
    
      // y * v_src_element_stride + 4 * x, evaluated in 32 bits.
    
      19408
      inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1,
    
                                  uint16x4_t y1, uint32x4_t &offsets_a,
    
                                  uint32x4_t &offsets_b, uint32x4_t &offsets_c,
    
                                  uint32x4_t &offsets_d,
    
                                  uint16x4_t v_src_element_stride) {
    
        // Multiply by 4 because of channels
    
      19408
        uint32x4_t x0_scaled = vshll_n_u16(x0, 2);
    
      19408
        uint32x4_t x1_scaled = vshll_n_u16(x1, 2);
    
        // Calculate offsets from coordinates (y * element_stride + x)
    
        // a: top left, b: top right, c: bottom left, d: bottom right
    
      19408
        offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride);
    
      19408
        offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride);
    
      19408
        offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride);
    
      19408
        offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride);
    
      19408
      }
    
      // Builds a vector of eight 16-bit lanes: the first four hold frac_low and the
    
      // last four hold frac_high, both zero-extended from 8 bits.
    
      inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low,
    
                                                    uint8_t frac_high) {
    
        uint8x8_t frac_low_high = {frac_low,  frac_low,  frac_low,  frac_low,
    
                                   frac_high, frac_high, frac_high, frac_high};
    
        return vmovl_u8(frac_low_high);
    
      }
    
      // Reads sizeof(uint32_t) bytes starting at src and returns them as a 32-bit
    
      // value zero-extended to 64 bits. The bytes are copied with memcpy rather
    
      // than read through a uint32_t pointer.
    
      157184
      inline uint64_t load_32bit(const uint8_t *src) {
    
      157184
        uint32_t value = 0;
    
      157184
        memcpy(&value, src, sizeof(uint32_t));
    
      314368
        return static_cast<uint64_t>(value);
    
      157184
      }
    
      // Gathers four 4-byte pixels from src_rows at the given offsets into one
    
      // 16-byte vector. Pixels 0 and 1 are packed into the first 64-bit word
    
      // (pixel 1 in its upper 32 bits), pixels 2 and 3 into the second.
    
      39296
      inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows,
    
                                     uint32x4_t offsets) {
    
      78592
        uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) |
    
      39296
                            (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32);
    
      78592
        uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) |
    
      39296
                            (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32);
    
      78592
        return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23));
    
      39296
      }
    
      // Writes the pair of result vectors to the location dst currently points at,
    
      // using the store() helper of neon::VecTraits<uint8_t>.
    
      4912
      inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) {
    
        using ScalarType = uint8_t;
    
      4912
        neon::VecTraits<ScalarType>::store(res, &dst[0]);
    
      4912
      }
    
      // Loads two pixels of four uint16_t elements each from src_rows, at the two
    
      // given offsets, and returns them side by side in one vector (the pixel at
    
      // offsets lane 0 in the low half).
    
      76672
      inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows,
    
                                     uint32x2_t offsets) {
    
      153344
        return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]),
    
      76672
                        vld1_u16(&src_rows[vget_lane_u32(offsets, 1)]));
    
      }
    
      // Writes the four result vectors to the location dst currently points at,
    
      // using the store() helper of neon::VecTraits<uint16_t>.
    
      4792
      inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) {
    
        using ScalarType = uint16_t;
    
      4792
        neon::VecTraits<ScalarType>::store(res, &dst[0]);
    
      4792
      }
    
      // Replicate border specific functions
    
      // Loads eight coordinates and fractions for the replicated-border case.
    
      // (x0, y0) are clamped to [0, v_xmax] x [0, v_ymax]; (x1, y1) are the next
    
      // column / row, except that they stay on the last one when (x0, y0) is
    
      // already there. Fractions are zeroed where the loaded coordinate was
    
      // negative.
    
      4852
      inline void get_coordinates_replicate(Columns<const int16_t> mapxy,
    
                                            Columns<const uint16_t> mapfrac,
    
                                            uint16x8_t &x0, uint16x8_t &y0,
    
                                            uint16x8_t &x1, uint16x8_t &y1,
    
                                            uint16x8_t &xfrac, uint16x8_t &yfrac,
    
                                            int16x8_t v_xmax, int16x8_t v_ymax) {
    
      4852
        get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
    
        // Zero the xfrac (or yfrac) if x (or y) are below zero
    
      4852
        xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac);
    
      4852
        yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac);
    
        // Clamp coordinates to within the dimensions of the source image
    
      4852
        x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax)));
    
      4852
        y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax)));
    
        // x1 = x0 + 1, except if it's already xmax
    
        // (the comparison yields all ones, i.e. -1, where x0 < xmax, so subtracting
    
        // it adds one; the same applies to y1)
    
      4852
        x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax));
    
      4852
        y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax));
    
      4852
      }
    
      // Gathers the four neighbour sets (a, b, c, d) of four 4-channel uint8_t
    
      // pixels each. No range handling is done here; the offsets are used as given.
    
      4912
      inline void load_pixels_u8_4ch_replicate(
    
          Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
    
          uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b,
    
          uint8x16_t &c, uint8x16_t &d) {
    
      4912
        a = load_4px_4ch(src_rows, offsets_a);
    
      4912
        b = load_4px_4ch(src_rows, offsets_b);
    
      4912
        c = load_4px_4ch(src_rows, offsets_c);
    
      4912
        d = load_4px_4ch(src_rows, offsets_d);
    
      4912
      }
    
      // Gathers the four neighbour sets (a, b, c, d) of four 4-channel uint16_t
    
      // pixels each. Every set is returned as two vectors: *_lo holds the pixels
    
      // for offset lanes 0-1, *_hi those for offset lanes 2-3. No range handling is
    
      // done here; the offsets are used as given.
    
      4792
      inline void load_pixels_u16_4ch_replicate(
    
          Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
    
          uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo,
    
          uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo,
    
          uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) {
    
      4792
        a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
    
      4792
        b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
    
      4792
        c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
    
      4792
        d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
    
      4792
        a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
    
      4792
        b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
    
      4792
        c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
    
      4792
        d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
    
      4792
      }
    
      // Row processor for 4-channel remapping with a replicated border; it is
    
      // specialised per pixel type below.
    
      template <typename ScalarType>
    
      class RemapS16Point5Replicate4ch;
    
      // uint8_t specialisation. For every map entry the four neighbouring source
    
      // pixels are gathered (coordinates clamped to the image) and combined by
    
      // interpolate(), which is defined outside this part of the file.
    
      template <>
    
      class RemapS16Point5Replicate4ch<uint8_t> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
        // Keeps the source rows and broadcasts the row stride and the largest valid
    
        // x / y coordinate (src_width - 1, src_height - 1).
    
        // NOTE(review): stride and dimensions are narrowed to 16 bits here;
    
        // presumably callers guarantee that they fit - confirm.
    
      66
        RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
    
                                   size_t src_height)
    
      66
            : src_rows_{src_rows},
    
      66
              v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
    
      66
              v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
    
      66
              v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
    
        // Processes one row of `width` map entries. Each vector_path call consumes
    
        // eight map entries and stores two result vectors, then advances mapxy,
    
        // mapfrac and dst by `step`.
    
      78
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2534
          auto vector_path = [&](size_t step) {
    
      2456
            uint16x8_t x0, y0, x1, y1;
    
      2456
            uint16x8_t xfrac, yfrac;
    
      4912
            get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
    
      2456
                                      v_xmax_, v_ymax_);
    
      2456
            uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
    
      2456
            uint8x16_t a, b, c, d;
    
      2456
            uint8x16x2_t res;
    
            // First half: the low four coordinates produce res.val[0]
    
      4912
            get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
    
      2456
                            vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2456
                            offsets_d, v_src_stride_);
    
      4912
            load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
    
      2456
                                         offsets_d, a, b, c, d);
    
            // Doubled fractions 001122..., low part
    
      2456
            uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
    
      2456
            uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
    
      4912
            uint16x8_t nxfrac2 =
    
      2456
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      4912
            uint16x8_t nyfrac2 =
    
      2456
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      4912
            uint16x8_t res0 = interpolate(
    
      2456
                vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
    
      2456
                vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
    
      2456
                vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
    
      4912
            uint16x8_t res1 = interpolate(
    
      2456
                vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
    
      2456
                vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
    
      2456
                vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
    
            // Narrow to 8 bits by keeping the even-indexed byte of every 16-bit lane
    
      2456
            res.val[0] =
    
      2456
                vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
    
            // Second half: the high four coordinates produce res.val[1]
    
      4912
            get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
    
      2456
                            vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2456
                            offsets_d, v_src_stride_);
    
      4912
            load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
    
      2456
                                         offsets_d, a, b, c, d);
    
            // Doubled fractions 001122..., high part
    
      2456
            xfrac2 = vzip2q(xfrac, xfrac);
    
      2456
            yfrac2 = vzip2q(yfrac, yfrac);
    
      2456
            nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      2456
            nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      4912
            res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
    
      2456
                               vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
    
      2456
                               vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
    
      2456
                               vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
    
      4912
            res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
    
      2456
                               vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
    
      2456
                               vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
    
      2456
                               vzip2q(nyfrac2, nyfrac2));
    
      2456
            res.val[1] =
    
      2456
                vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
    
      2456
            store_pixels_u8_4ch(res, dst);
    
      2456
            mapxy += ptrdiff_t(step);
    
      2456
            mapfrac += ptrdiff_t(step);
    
      2456
            dst += ptrdiff_t(step);
    
      2456
          };
    
      78
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      78
          loop.unroll_once(vector_path);
    
          // Tail handling: move back by (step - remaining) so that one more full
    
          // vector ends on the last element of the row, recomputing some elements
    
          // that were already written.
    
          // NOTE(review): presumably loop.remaining() only invokes the callback when
    
          // elements are left, and callers ensure width >= loop.step() - confirm.
    
      156
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      78
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      78
          mapxy -= back_step;
    
      78
          mapfrac -= back_step;
    
      78
          dst -= back_step;
    
      115
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      78
        }
    
       private:
    
        Rows<const ScalarType> src_rows_;
    
        // Row stride as returned by src_rows_.stride(), broadcast to four lanes
    
        uint16x4_t v_src_stride_;
    
        // Largest valid x coordinate (src_width - 1), broadcast
    
        int16x8_t v_xmax_;
    
        // Largest valid y coordinate (src_height - 1), broadcast
    
        int16x8_t v_ymax_;
    
      };  // end of class RemapS16Point5Replicate4ch<uint8_t>
    
      // uint16_t specialisation of the replicated-border 4-channel row processor.
    
      // For every map entry the four neighbouring source pixels are gathered
    
      // (coordinates clamped to the image) and combined by interpolate(), which is
    
      // defined outside this part of the file.
    
      template <>
    
      class RemapS16Point5Replicate4ch<uint16_t> {
    
       public:
    
        using ScalarType = uint16_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
        // Keeps the source rows and broadcasts the row stride in elements
    
        // (stride() / sizeof(ScalarType)) and the largest valid x / y coordinate.
    
        // NOTE(review): stride and dimensions are narrowed to 16 bits here;
    
        // presumably callers guarantee that they fit - confirm.
    
      66
        RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
    
                                   size_t src_height)
    
      66
            : src_rows_{src_rows},
    
      132
              v_src_element_stride_{vdup_n_u16(
    
      66
                  static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
    
      66
              v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
    
      66
              v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
    
        // Processes one row of `width` map entries. Each vector_path call consumes
    
        // eight map entries and stores four result vectors, then advances mapxy,
    
        // mapfrac and dst by `step`.
    
      78
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2474
          auto vector_path = [&](size_t step) {
    
      2396
            uint16x8_t x0, y0, x1, y1;
    
      2396
            uint16x8_t xfrac, yfrac;
    
      4792
            get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
    
      2396
                                      v_xmax_, v_ymax_);
    
      2396
            uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
    
      2396
            uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
    
      2396
            uint16x8x4_t res;
    
            // First half: the low four coordinates produce res.val[0] and res.val[1]
    
      4792
            get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
    
      2396
                            vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2396
                            offsets_d, v_src_element_stride_);
    
      4792
            load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
    
      2396
                                          offsets_d, a_low, a_high, b_low, b_high,
    
                                          c_low, c_high, d_low, d_high);
    
            // Doubled fractions 001122..., low part
    
      2396
            uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
    
      2396
            uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
    
      4792
            uint16x8_t nxfrac2 =
    
      2396
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      4792
            uint16x8_t nyfrac2 =
    
      2396
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      2396
            res.val[0] =
    
      4792
                interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
    
      2396
                            vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
    
      2396
                            vzip1q(nyfrac2, nyfrac2));
    
      2396
            res.val[1] =
    
      4792
                interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
    
      2396
                            vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
    
      2396
                            vzip2q(nyfrac2, nyfrac2));
    
            // Second half: the high four coordinates produce res.val[2] and
    
            // res.val[3]
    
      4792
            get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
    
      2396
                            vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2396
                            offsets_d, v_src_element_stride_);
    
      4792
            load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
    
      2396
                                          offsets_d, a_low, a_high, b_low, b_high,
    
                                          c_low, c_high, d_low, d_high);
    
            // Doubled fractions 001122..., high part
    
      2396
            xfrac2 = vzip2q(xfrac, xfrac);
    
      2396
            yfrac2 = vzip2q(yfrac, yfrac);
    
      2396
            nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      2396
            nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      2396
            res.val[2] =
    
      4792
                interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
    
      2396
                            vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
    
      2396
                            vzip1q(nyfrac2, nyfrac2));
    
      2396
            res.val[3] =
    
      4792
                interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
    
      2396
                            vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
    
      2396
                            vzip2q(nyfrac2, nyfrac2));
    
      2396
            store_pixels_u16_4ch(res, dst);
    
      2396
            mapxy += ptrdiff_t(step);
    
      2396
            mapfrac += ptrdiff_t(step);
    
      2396
            dst += ptrdiff_t(step);
    
      2396
          };
    
      78
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      78
          loop.unroll_once(vector_path);
    
          // Tail handling: move back by (step - remaining) so that one more full
    
          // vector ends on the last element of the row, recomputing some elements
    
          // that were already written.
    
          // NOTE(review): presumably loop.remaining() only invokes the callback when
    
          // elements are left, and callers ensure width >= loop.step() - confirm.
    
      156
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      78
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      78
          mapxy -= back_step;
    
      78
          mapfrac -= back_step;
    
      78
          dst -= back_step;
    
      115
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      78
        }
    
       private:
    
        Rows<const ScalarType> src_rows_;
    
        // Row stride in uint16_t elements, broadcast to four lanes
    
        uint16x4_t v_src_element_stride_;
    
        // Largest valid x coordinate (src_width - 1), broadcast
    
        int16x8_t v_xmax_;
    
        // Largest valid y coordinate (src_height - 1), broadcast
    
        int16x8_t v_ymax_;
    
      };  // end of class RemapS16Point5Replicate4ch<uint16_t>
    
      // Constant border specific functions
    
      // Loads eight coordinates and fractions for the constant-border case and
    
      // computes, for each of the four neighbours (a: x0/y0, b: x1/y0, c: x0/y1,
    
      // d: x1/y1), a per-lane mask that is set when the neighbour lies inside the
    
      // v_width x v_height image. Coordinates are not clamped.
    
      4852
      inline void get_coordinates_constant(
    
          Columns<const int16_t> mapxy, Columns<const uint16_t> mapfrac,
    
          uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0,
    
          uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac,
    
          uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c,
    
          uint16x8_t &in_range_d) {
    
      4852
        get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
    
      4852
        uint16x8_t one = vdupq_n_u16(1);
    
        // Unsigned addition wraps, so a coordinate of -1 gives a neighbour at 0
    
      4852
        x1 = vaddq_u16(x0, one);
    
      4852
        y1 = vaddq_u16(y0, one);
    
        // Unsigned comparisons: the bit patterns of negative coordinates are large
    
        // values, so a single "less than" test also rejects them
    
      4852
        uint16x8_t x0_in_range = vcltq_u16(x0, v_width);
    
      4852
        uint16x8_t y0_in_range = vcltq_u16(y0, v_height);
    
      4852
        uint16x8_t x1_in_range = vcltq_u16(x1, v_width);
    
      4852
        uint16x8_t y1_in_range = vcltq_u16(y1, v_height);
    
      4852
        in_range_a = vandq(x0_in_range, y0_in_range);
    
      4852
        in_range_b = vandq(x1_in_range, y0_in_range);
    
      4852
        in_range_c = vandq(x0_in_range, y1_in_range);
    
      4852
        in_range_d = vandq(x1_in_range, y1_in_range);
    
      4852
      }
    
      // Keeps each offset whose in_range bits are set and replaces the others with
    
      // 0 (presumably so that the following gather reads a valid location for
    
      // pixels that will be replaced by the border value anyway).
    
      38816
      inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range,
    
                                                  uint32x4_t offsets) {
    
      38816
        return vbslq_u32(in_range, offsets, vdupq_n_u32(0));
    
      }
    
      // Selects per 32-bit lane (one 4-channel uint8_t pixel): bits of `pixels`
    
      // where in_range is set, bits of v_border elsewhere.
    
      // NOTE(review): v_border is handed to vbslq_u32 without a reinterpret, which
    
      // relies on implicit vector conversion - confirm the build flags allow it.
    
      19648
      inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range,
    
                                                         uint8x16_t pixels,
    
                                                         uint8x16_t v_border) {
    
      19648
        return vreinterpretq_u8_u32(
    
      19648
            vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), v_border));
    
      }
    
      // Selects per 64-bit lane (one 4-channel uint16_t pixel): bits of `pixels`
    
      // where in_range is set, bits of v_border elsewhere.
    
      // NOTE(review): v_border is handed to vbslq_u64 without a reinterpret, which
    
      // relies on implicit vector conversion - confirm the build flags allow it.
    
      38336
      inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range,
    
                                                          uint16x8_t pixels,
    
                                                          uint16x8_t v_border) {
    
      38336
        return vreinterpretq_u16_u64(
    
      38336
            vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), v_border));
    
      }
    
      // Gathers the four neighbour sets (a, b, c, d) of four 4-channel uint8_t
    
      // pixels each for the constant-border case. Offsets of out-of-range
    
      // neighbours are set to 0 before loading, and the pixels loaded for them are
    
      // then replaced by v_border. The in_range_* masks hold one 32-bit lane per
    
      // pixel.
    
      4912
      inline void load_pixels_u8_4ch_constant(
    
          Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
    
          uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
    
          uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
    
          uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c,
    
          uint8x16_t &d) {
    
      4912
        offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
    
      4912
        offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
    
      4912
        offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
    
      4912
        offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
    
      4912
        a = load_4px_4ch(src_rows, offsets_a);
    
      4912
        b = load_4px_4ch(src_rows, offsets_b);
    
      4912
        c = load_4px_4ch(src_rows, offsets_c);
    
      4912
        d = load_4px_4ch(src_rows, offsets_d);
    
      4912
        a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border);
    
      4912
        b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border);
    
      4912
        c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border);
    
      4912
        d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border);
    
      4912
      }
    
      // Gathers the four neighbour sets (a, b, c, d) of four 4-channel uint16_t
    
      // pixels each for the constant-border case. Every set is returned as two
    
      // vectors: *_lo for offset lanes 0-1 and *_hi for offset lanes 2-3. Offsets
    
      // of out-of-range neighbours are set to 0 before loading, and the pixels
    
      // loaded for them are then replaced by v_border. The in_range_* masks hold
    
      // one 32-bit lane per pixel and are widened to 64 bits to cover a whole
    
      // pixel.
    
      4792
      inline void load_pixels_u16_4ch_constant(
    
          Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
    
          uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
    
          uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
    
          uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo,
    
          uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo,
    
          uint16x8_t &d_hi) {
    
      4792
        offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
    
      4792
        offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
    
      4792
        offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
    
      4792
        offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
    
      4792
        a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
    
      4792
        b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
    
      4792
        c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
    
      4792
        d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
    
        // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
    
        // (sign extension of the low two 32-bit lanes)
    
      23960
        auto low32_to_u64 = [](uint32x4_t bitset) {
    
      19168
          return vreinterpretq_u64_s64(
    
      19168
              vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset))));
    
        };
    
      9584
        a_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo,
    
      4792
                                                 v_border);
    
      9584
        b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo,
    
      4792
                                                 v_border);
    
      9584
        c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo,
    
      4792
                                                 v_border);
    
      9584
        d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo,
    
      4792
                                                 v_border);
    
      4792
        a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
    
      4792
        b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
    
      4792
        c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
    
      4792
        d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
    
        // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
    
        // (sign extension of the high two 32-bit lanes)
    
      23960
        auto hi32_to_u64 = [](uint32x4_t bitset) {
    
      19168
          return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset)));
    
        };
    
      9584
        a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi,
    
      4792
                                                 v_border);
    
      9584
        b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi,
    
      4792
                                                 v_border);
    
      9584
        c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi,
    
      4792
                                                 v_border);
    
      9584
        d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi,
    
      4792
                                                 v_border);
    
      4792
      }
    
      // Convert bitsets such as in_range to 32 bits, making all 1s or all 0s:
    
      // sign-extends the low four 16-bit lanes to four 32-bit lanes. Despite the
    
      // "s32" in the name, the result is returned as an unsigned vector.
    
      19408
      static uint32x4_t low16_to_s32(uint16x8_t bitset) {
    
      19408
        return vreinterpretq_u32_s32(
    
      19408
            vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset))));
    
      }
    
      // Sign-extends the high four 16-bit lanes of a bitset such as in_range to
    
      // four 32-bit lanes, so that each lane stays all 1s or all 0s. Despite the
    
      // "s32" in the name, the result is returned as an unsigned vector.
    
      19408
      static uint32x4_t hi16_to_s32(uint16x8_t bitset) {
    
      19408
        return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset)));
    
      }
    
      // Row processor for 4-channel remapping with a constant border value; it is
    
      // specialised per pixel type below.
    
      template <typename ScalarType>
    
      class RemapS16Point5Constant4ch;
    
      // uint8_t specialisation. For every map entry the four neighbouring source
    
      // pixels are gathered, neighbours outside the image are replaced by the
    
      // border pixel, and the result is combined by interpolate(), which is
    
      // defined outside this part of the file.
    
      template <>
    
      class RemapS16Point5Constant4ch<uint8_t> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
        // Keeps the source rows and broadcasts the row stride, the image width and
    
        // height, and the border pixel. Four bytes are read from border_value, so
    
        // it must point to at least four readable elements.
    
        // NOTE(review): stride and dimensions are narrowed to 16 bits here;
    
        // presumably callers guarantee that they fit - confirm.
    
      66
        RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
    
                                  size_t src_height, const ScalarType *border_value)
    
      66
            : src_rows_{src_rows},
    
      66
              v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
    
      66
              v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
    
      66
              v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
    
      66
              v_border_{} {
    
          // Copy the 4-channel border pixel into one 32-bit value and repeat it in
    
          // every 32-bit lane
    
      66
          uint32_t border_value_32{};
    
      66
          memcpy(&border_value_32, border_value, sizeof(uint32_t));
    
      66
          v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32));
    
      66
        }
    
        // Processes one row of `width` map entries. Each vector_path call consumes
    
        // eight map entries and stores two result vectors, then advances mapxy,
    
        // mapfrac and dst by `step`.
    
      78
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
      2534
          auto vector_path = [&](size_t step) {
    
      2456
            uint16x8_t x0, y0, x1, y1;
    
      2456
            uint16x8_t xfrac, yfrac;
    
      2456
            uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
    
      2456
            get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
    
                                     y1, xfrac, yfrac, in_range_a, in_range_b,
    
                                     in_range_c, in_range_d);
    
      2456
            uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
    
      2456
            uint8x16_t a, b, c, d;
    
      2456
            uint8x16x2_t res;
    
            // First half: the low four coordinates produce res.val[0]
    
      4912
            get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
    
      2456
                            vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2456
                            offsets_d, v_src_stride_);
    
      2456
            load_pixels_u8_4ch_constant(
    
      2456
                src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
    
      2456
                low16_to_s32(in_range_a), low16_to_s32(in_range_b),
    
      2456
                low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b,
    
                c, d);
    
            // Doubled fractions 001122..., low part
    
      2456
            uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
    
      2456
            uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
    
      4912
            uint16x8_t nxfrac2 =
    
      2456
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      4912
            uint16x8_t nyfrac2 =
    
      2456
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      4912
            uint16x8_t res0 = interpolate(
    
      2456
                vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
    
      2456
                vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
    
      2456
                vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
    
      4912
            uint16x8_t res1 = interpolate(
    
      2456
                vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
    
      2456
                vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
    
      2456
                vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
    
            // Narrow to 8 bits by keeping the even-indexed byte of every 16-bit lane
    
      2456
            res.val[0] =
    
      2456
                vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
    
            // Second half: the high four coordinates produce res.val[1]
    
      4912
            get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
    
      2456
                            vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2456
                            offsets_d, v_src_stride_);
    
      2456
            load_pixels_u8_4ch_constant(
    
      2456
                src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
    
      2456
                hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
    
      2456
                hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c,
    
                d);
    
            // Doubled fractions 001122..., high part
    
      2456
            xfrac2 = vzip2q(xfrac, xfrac);
    
      2456
            yfrac2 = vzip2q(yfrac, yfrac);
    
      2456
            nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      2456
            nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      4912
            res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
    
      2456
                               vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
    
      2456
                               vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
    
      2456
                               vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
    
      4912
            res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
    
      2456
                               vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
    
      2456
                               vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
    
      2456
                               vzip2q(nyfrac2, nyfrac2));
    
      2456
            res.val[1] =
    
      2456
                vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
    
      2456
            store_pixels_u8_4ch(res, dst);
    
      2456
            mapxy += ptrdiff_t(step);
    
      2456
            mapfrac += ptrdiff_t(step);
    
      2456
            dst += ptrdiff_t(step);
    
      2456
          };
    
      78
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      78
          loop.unroll_once(vector_path);
    
          // Tail handling: move back by (step - remaining) so that one more full
    
          // vector ends on the last element of the row, recomputing some elements
    
          // that were already written.
    
          // NOTE(review): presumably loop.remaining() only invokes the callback when
    
          // elements are left, and callers ensure width >= loop.step() - confirm.
    
      156
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      78
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      78
          mapxy -= back_step;
    
      78
          mapfrac -= back_step;
    
      78
          dst -= back_step;
    
      115
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      78
        }
    
       private:
    
        Rows<const ScalarType> src_rows_;
    
        // Row stride as returned by src_rows_.stride(), broadcast to four lanes
    
        uint16x4_t v_src_stride_;
    
        // Source image width, broadcast
    
        uint16x8_t v_width_;
    
        // Source image height, broadcast
    
        uint16x8_t v_height_;
    
        // The 4-byte border pixel repeated four times
    
        uint8x16_t v_border_;
    
      };  // end of class RemapS16Point5Constant4ch<uint8_t>
    
      // Specialisation of RemapS16Point5Constant4ch for uint16_t pixels.
      // process_row() produces one destination row: for each vector of map
      // entries it gathers four source pixels (a, b, c, d) per output pixel,
      // substituting v_border_ where the in_range_* masks select it, and blends
      // them with interpolate() using the x/y fractions.
    
      template <>
    
      class RemapS16Point5Constant4ch<uint16_t> {
    
       public:
    
        using ScalarType = uint16_t;
    
        using MapVecTraits = neon::VecTraits<int16_t>;
    
        // Stores the source rows and broadcasts per-image constants into vectors:
        //  - the row stride divided by sizeof(ScalarType), i.e. expressed in
        //    elements (presumably stride() is in bytes - confirm against Rows);
        //  - src_width and src_height, each narrowed to uint16_t;
        //  - the border colour: sizeof(uint64_t) bytes (four uint16_t values) are
        //    copied from border_value and repeated in both 64-bit halves of
        //    v_border_, so border_value must point to at least four elements.
    
      66
        RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
    
                                  size_t src_height, const ScalarType *border_value)
    
      66
            : src_rows_{src_rows},
    
      132
              v_src_element_stride_{vdup_n_u16(
    
      66
                  static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
    
      66
              v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
    
      66
              v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
    
      66
              v_border_{} {
    
      66
          uint64_t border_value_64{};
    
          // memcpy is used instead of a pointer cast to read the four 16-bit
          // channel values as one 64-bit scalar.
    
      66
          memcpy(&border_value_64, border_value, sizeof(uint64_t));
    
      66
          v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64));
    
      66
        }
    
        // Processes `width` map entries. mapxy, mapfrac and dst are taken by
        // value, so the pointer arithmetic below only affects these local copies.
    
      78
        void process_row(size_t width, Columns<const int16_t> mapxy,
    
                         Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
    
          // Handles one vector of map entries, then advances mapxy, mapfrac and
          // dst by `step`.
    
      2474
          auto vector_path = [&](size_t step) {
    
      2396
            uint16x8_t x0, y0, x1, y1;
    
      2396
            uint16x8_t xfrac, yfrac;
    
      2396
            uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
    
            // The variables declared above are outputs of this call.
    
      2396
            get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
    
                                     y1, xfrac, yfrac, in_range_a, in_range_b,
    
                                     in_range_c, in_range_d);
    
      2396
            uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
    
      2396
            uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
    
      2396
            uint16x8x4_t res;
    
            // First half: the lanes selected by vget_low_u16 / low16_to_s32.
    
      4792
            get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
    
      2396
                            vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2396
                            offsets_d, v_src_element_stride_);
    
      2396
            load_pixels_u16_4ch_constant(
    
      2396
                src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
    
      2396
                low16_to_s32(in_range_a), low16_to_s32(in_range_b),
    
      2396
                low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low,
    
                a_high, b_low, b_high, c_low, c_high, d_low, d_high);
    
            // Doubled fractions 001122..., low part
    
      2396
            uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
    
      2396
            uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
    
            // Complementary weights: REMAP16POINT5_FRAC_MAX minus the fraction.
    
      4792
            uint16x8_t nxfrac2 =
    
      2396
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      4792
            uint16x8_t nyfrac2 =
    
      2396
                vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      2396
            res.val[0] =
    
      4792
                interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
    
      2396
                            vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
    
      2396
                            vzip1q(nyfrac2, nyfrac2));
    
      2396
            res.val[1] =
    
      4792
                interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
    
      2396
                            vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
    
      2396
                            vzip2q(nyfrac2, nyfrac2));
    
            // Second half: the lanes selected by vget_high_u16 / hi16_to_s32.
            // offsets_* and a_low..d_high are overwritten here, which is why
            // res.val[0] and res.val[1] had to be computed before this point.
    
      4792
            get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
    
      2396
                            vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
    
      2396
                            offsets_d, v_src_element_stride_);
    
      2396
            load_pixels_u16_4ch_constant(
    
      2396
                src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
    
      2396
                hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
    
      2396
                hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low,
    
                a_high, b_low, b_high, c_low, c_high, d_low, d_high);
    
            // Doubled fractions 001122..., high part
    
      2396
            xfrac2 = vzip2q(xfrac, xfrac);
    
      2396
            yfrac2 = vzip2q(yfrac, yfrac);
    
      2396
            nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
    
      2396
            nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
    
            // Quadrupled fractions (00001111) are passed to interpolate
    
      2396
            res.val[2] =
    
      4792
                interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
    
      2396
                            vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
    
      2396
                            vzip1q(nyfrac2, nyfrac2));
    
      2396
            res.val[3] =
    
      4792
                interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
    
      2396
                            vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
    
      2396
                            vzip2q(nyfrac2, nyfrac2));
    
      2396
            store_pixels_u16_4ch(res, dst);
    
      2396
            mapxy += ptrdiff_t(step);
    
      2396
            mapfrac += ptrdiff_t(step);
    
      2396
            dst += ptrdiff_t(step);
    
      2396
          };
    
      78
          LoopUnroll loop{width, MapVecTraits::num_lanes()};
    
      78
          loop.unroll_once(vector_path);
    
          // Tail handling: rewind all three cursors by (step - remaining_length)
          // and run vector_path again for whatever is left.
          // NOTE(review): presumably this makes the final full-width vector end
          // exactly at the end of the row, recomputing some elements that were
          // already written; this depends on LoopUnroll semantics that are not
          // visible here - confirm.
    
      156
          ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
    
      78
                                static_cast<ptrdiff_t>(loop.remaining_length());
    
      78
          mapxy -= back_step;
    
      78
          mapfrac -= back_step;
    
      78
          dst -= back_step;
    
      115
          loop.remaining([&](size_t, size_t step) { vector_path(step); });
    
      78
        }
    
       private:
    
        Rows<const ScalarType> src_rows_;
    
        // Source row stride in elements, broadcast to four 16-bit lanes.
    
        uint16x4_t v_src_element_stride_;
    
        // Source width and height, broadcast to all lanes.
    
        uint16x8_t v_width_;
    
        uint16x8_t v_height_;
    
        // Four border values repeated twice; see the constructor.
    
        uint16x8_t v_border_;
    
      };  // end of class RemapS16Point5Constant4ch<uint16_t>
    
      // Most of the complexity comes from parameter checking.
    
      // NOLINTBEGIN(readability-function-cognitive-complexity)
    
      // Entry point of the NEON remap: validates the arguments, then runs the
      // operation class matching border_type and channels over every
      // destination row.
      //
      // mapxy supplies two int16_t values per destination pixel and mapfrac one
      // uint16_t value (see the channel counts passed to Rows below). Both maps
      // are validated against dst_height, i.e. they have one row per
      // destination row.
      //
      // Returns:
      //  - KLEIDICV_ERROR_NULL_POINTER if border_type is
      //    KLEIDICV_BORDER_TYPE_CONSTANT and border_value is null;
      //  - KLEIDICV_ERROR_NOT_IMPLEMENTED if remap_s16point5_is_implemented<T>()
      //    rejects the combination of stride, sizes, border type and channels;
      //  - KLEIDICV_OK once all rows have been processed.
      // The CHECK_* macros are defined elsewhere; presumably they return an
      // error code from this function when a check fails - confirm.
    
      template <typename T>
    
      570
      kleidicv_error_t remap_s16point5(
    
          const T *src, size_t src_stride, size_t src_width, size_t src_height,
    
          T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
    
          size_t channels, const int16_t *mapxy, size_t mapxy_stride,
    
          const uint16_t *mapfrac, size_t mapfrac_stride,
    
          [[maybe_unused]] kleidicv_border_type_t border_type,
    
          [[maybe_unused]] const T *border_value) {
    
        8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 284 times.

      570
        CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
    
        8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 283 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 283 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 283 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 283 times.

      568
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
    
        8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 282 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 282 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 282 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 282 times.

      566
        CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
    
        8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 281 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 281 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 281 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 281 times.

      564
        CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height);
    
        12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 280 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 278 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 278 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 280 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 278 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 278 times.

      562
        CHECK_IMAGE_SIZE(src_width, src_height);
    
        12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 277 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 276 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 276 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 277 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 276 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 276 times.

      556
        CHECK_IMAGE_SIZE(dst_width, dst_height);
    
        // A border colour is only needed for the constant border mode; in any
        // other mode border_value may be null.
    
        8/8✓ Branch 0 taken 134 times.
✓ Branch 1 taken 142 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 134 times.
✓ Branch 5 taken 142 times.
✓ Branch 6 taken 133 times.
✓ Branch 7 taken 1 times.

      552
        if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
    
      2
          return KLEIDICV_ERROR_NULL_POINTER;
    
        }
    
        // The asserts in the dispatch below (channels == 4, replicate border)
        // rely on this check having rejected every other combination - the
        // predicate itself is defined elsewhere.
    
        8/8✓ Branch 0 taken 265 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 265 times.
✓ Branch 3 taken 10 times.
✓ Branch 4 taken 265 times.
✓ Branch 5 taken 10 times.
✓ Branch 6 taken 265 times.
✓ Branch 7 taken 10 times.

      1100
        if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height,
    
      550
                                               dst_width, border_type, channels)) {
    
      20
          return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
        }
    
        // Row views: source and destination carry `channels` values per pixel,
        // mapxy two values per pixel and mapfrac one.
    
      530
        Rows<const T> src_rows{src, src_stride, channels};
    
      530
        Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
    
      530
        Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1};
    
      530
        Rows<T> dst_rows{dst, dst_stride, channels};
    
      530
        Rectangle rect{dst_width, dst_height};
    
        // Dispatch on border mode first, then on channel count (1 or 4).
    
        4/4✓ Branch 0 taken 133 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 132 times.

      530
        if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
    
        4/4✓ Branch 0 taken 66 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 66 times.
✓ Branch 3 taken 66 times.

      264
          if (channels == 1) {
    
      264
            RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height,
    
      132
                                                      border_value};
    
      132
            zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
    
      132
          } else {
    
            assert(channels == 4);
    
      264
            RemapS16Point5Constant4ch<T> operation{src_rows, src_width, src_height,
    
      132
                                                   border_value};
    
      132
            zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
    
      132
          }
    
      264
        } else {
    
          assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
    
        4/4✓ Branch 0 taken 67 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 67 times.
✓ Branch 3 taken 66 times.

      266
          if (channels == 1) {
    
      134
            RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height};
    
      134
            zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
    
      134
          } else {
    
            assert(channels == 4);
    
      132
            RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height};
    
      132
            zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
    
      132
          }
    
        }
    
      530
        return KLEIDICV_OK;
    
      570
      }
    
      // NOLINTEND(readability-function-cognitive-complexity)
    
      // Explicitly instantiates remap_s16point5<type>, decorated with
      // KLEIDICV_TARGET_FN_ATTRS. The parameter list must stay in sync with the
      // template definition above. Instantiated below for uint8_t and uint16_t.
    
      #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type)                    \
    
        template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>(    \
    
            const type *src, size_t src_stride, size_t src_width, size_t src_height, \
    
            type *dst, size_t dst_stride, size_t dst_width, size_t dst_height,       \
    
            size_t channels, const int16_t *mapxy, size_t mapxy_stride,              \
    
            const uint16_t *mapfrac, size_t mapfrac_stride,                          \
    
            kleidicv_border_type_t border_type, const type *border_value)
    
      KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
    
      KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
    
      }  // namespace kleidicv::neon