| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cassert> | ||
| 6 | |||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | #include "kleidicv/transform/remap.h" | ||
| 9 | |||
| 10 | namespace kleidicv::neon { | ||
| 11 | |||
| 12 | template <typename ScalarType> | ||
| 13 | class RemapS16Point5Replicate; | ||
| 14 | |||
| 15 | template <> | ||
| 16 | class RemapS16Point5Replicate<uint8_t> { | ||
| 17 | public: | ||
| 18 | using ScalarType = uint8_t; | ||
| 19 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 20 | using MapVectorType = typename MapVecTraits::VectorType; | ||
| 21 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
| 22 | using FracVecTraits = neon::VecTraits<uint16_t>; | ||
| 23 | using FracVectorType = typename FracVecTraits::VectorType; | ||
| 24 | |||
| 25 | 134 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
| 26 | size_t src_height) | ||
| 27 | 134 | : src_rows_{src_rows}, | |
| 28 | 134 | v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
| 29 | 134 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
| 30 | 134 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
| 31 | |||
| 32 | 158 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 33 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 34 | 5432 | auto vector_path = [&](size_t step) { | |
| 35 | 5274 | MapVector2Type xy = vld2q_s16(&mapxy[0]); | |
| 36 | 5274 | FracVectorType frac = vld1q_u16(&mapfrac[0]); | |
| 37 | 10548 | uint16x8_t xfrac = | |
| 38 | 10548 | vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), | |
| 39 | // extract xfrac = frac[0:4] | ||
| 40 | 5274 | vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); | |
| 41 | 10548 | uint16x8_t yfrac = | |
| 42 | 10548 | vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), | |
| 43 | // extract yfrac = frac[5:9] | ||
| 44 | 10548 | vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), | |
| 45 | 5274 | vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); | |
| 46 | 5274 | uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
| 47 | 5274 | uint16x8_t nyfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
| 48 | |||
| 49 | // Clamp coordinates to within the dimensions of the source image | ||
| 50 | 10548 | uint16x8_t x0 = vreinterpretq_u16_s16( | |
| 51 | 5274 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); | |
| 52 | 10548 | uint16x8_t y0 = vreinterpretq_u16_s16( | |
| 53 | 5274 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); | |
| 54 | |||
| 55 | // x1 = x0 + 1, except if it's already xmax | ||
| 56 | 5274 | uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_)); | |
| 57 | 5274 | uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_)); | |
| 58 | |||
| 59 | 10548 | uint16x4_t dst_low = load_and_interpolate( | |
| 60 | 5274 | vmovl_u16(vget_low_u16(x0)), vget_low_u16(y0), | |
| 61 | 5274 | vmovl_u16(vget_low_u16(x1)), vget_low_u16(y1), vget_low_u16(xfrac), | |
| 62 | 5274 | vget_low_u16(yfrac), vget_low_u16(nxfrac), vget_low_u16(nyfrac)); | |
| 63 | |||
| 64 | 10548 | uint16x4_t dst_high = load_and_interpolate( | |
| 65 | 5274 | vmovl_high_u16(x0), vget_high_u16(y0), vmovl_high_u16(x1), | |
| 66 | 5274 | vget_high_u16(y1), vget_high_u16(xfrac), vget_high_u16(yfrac), | |
| 67 | 5274 | vget_high_u16(nxfrac), vget_high_u16(nyfrac)); | |
| 68 | |||
| 69 | 5274 | vst1_u8(&dst[0], vuzp1_u8(dst_low, dst_high)); | |
| 70 | 5274 | mapxy += ptrdiff_t(step); | |
| 71 | 5274 | mapfrac += ptrdiff_t(step); | |
| 72 | 5274 | dst += ptrdiff_t(step); | |
| 73 | 5274 | }; | |
| 74 | 158 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 75 | 158 | loop.unroll_once(vector_path); | |
| 76 | 316 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 77 | 158 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 78 | 158 | mapxy -= back_step; | |
| 79 | 158 | mapfrac -= back_step; | |
| 80 | 158 | dst -= back_step; | |
| 81 | 232 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 82 | 158 | } | |
| 83 | |||
| 84 | private: | ||
| 85 | 10548 | uint16x4_t load_and_interpolate(uint32x4_t x0, uint16x4_t y0, uint32x4_t x1, | |
| 86 | uint16x4_t y1, uint16x4_t xfrac, | ||
| 87 | uint16x4_t yfrac, uint16x4_t nxfrac, | ||
| 88 | uint16x4_t nyfrac) { | ||
| 89 | // Calculate offsets from coordinates (y * stride + x) | ||
| 90 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
| 91 | 10548 | uint32x4_t offset = vmlal_u16(x0, y0, v_src_stride_); | |
| 92 | 21096 | uint64_t acc = | |
| 93 | 21096 | static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
| 94 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
| 95 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
| 96 | 10548 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
| 97 | 10548 | uint16x4_t a = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
| 98 | |||
| 99 | 10548 | offset = vmlal_u16(x1, y0, v_src_stride_); | |
| 100 | |||
| 101 | 31644 | acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
| 102 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
| 103 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
| 104 | 10548 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
| 105 | 10548 | uint16x4_t b = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
| 106 | |||
| 107 | 10548 | uint16x4_t line0 = vmla_u16(vmul_u16(xfrac, b), nxfrac, a); | |
| 108 | |||
| 109 | 10548 | offset = vmlal_u16(x0, y1, v_src_stride_); | |
| 110 | |||
| 111 | 31644 | acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
| 112 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
| 113 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
| 114 | 10548 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
| 115 | 10548 | uint16x4_t c = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
| 116 | |||
| 117 | 21096 | uint32x4_t line0_lerpd = vmlal_u16( | |
| 118 | 10548 | vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, nyfrac); | |
| 119 | |||
| 120 | 10548 | offset = vmlal_u16(x1, y1, v_src_stride_); | |
| 121 | |||
| 122 | 31644 | acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
| 123 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
| 124 | 21096 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
| 125 | 10548 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
| 126 | 10548 | uint16x4_t d = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
| 127 | |||
| 128 | 10548 | uint16x4_t line1 = vmla_u16(vmul_u16(xfrac, d), nxfrac, c); | |
| 129 | 21096 | return vshrn_n_u32(vmlal_u16(line0_lerpd, line1, yfrac), | |
| 130 | 2 * REMAP16POINT5_FRAC_BITS); | ||
| 131 | 10548 | } | |
| 132 | |||
| 133 | Rows<const ScalarType> src_rows_; | ||
| 134 | uint16x4_t v_src_stride_; | ||
| 135 | int16x8_t v_xmax_; | ||
| 136 | int16x8_t v_ymax_; | ||
| 137 | }; // end of class RemapS16Point5Replicate<uint8_t> | ||
| 138 | |||
| 139 | // Common interpolation function used by all RemapS16Point5 operations except | ||
| 140 | // 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>) | ||
| 141 | // because that processes one half vector in one step | ||
| 142 | 97170 | static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, | |
| 143 | uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac, | ||
| 144 | uint16x8_t nxfrac, uint16x8_t nyfrac) { | ||
| 145 | 485850 | auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right, | |
| 146 | uint16x4_t frac, | ||
| 147 | uint16x4_t nfrac) -> uint32x4_t { | ||
| 148 | 388680 | return vmlal_u16(vmull_u16(nfrac, left), frac, right); | |
| 149 | }; | ||
| 150 | |||
| 151 | 291510 | auto interpolate_horizontal_low = [interpolate_horizontal]( | |
| 152 | uint16x8_t left, uint16x8_t right, | ||
| 153 | uint16x8_t frac, | ||
| 154 | uint16x8_t nfrac) -> uint32x4_t { | ||
| 155 | 388680 | return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), | |
| 156 | 194340 | vget_low_u16(frac), vget_low_u16(nfrac)); | |
| 157 | }; | ||
| 158 | |||
| 159 | 291510 | auto interpolate_horizontal_high = [interpolate_horizontal]( | |
| 160 | uint16x8_t left, uint16x8_t right, | ||
| 161 | uint16x8_t frac, | ||
| 162 | uint16x8_t nfrac) -> uint32x4_t { | ||
| 163 | 388680 | return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), | |
| 164 | 194340 | vget_high_u16(frac), vget_high_u16(nfrac)); | |
| 165 | }; | ||
| 166 | |||
| 167 | // Offset pixel values by 0.5 before rounding down. | ||
| 168 | 97170 | const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 169 | |||
| 170 | 291510 | auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, | |
| 171 | uint32x4_t nfrac) -> uint32x4_t { | ||
| 172 | 194340 | uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); | |
| 173 | 388680 | return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); | |
| 174 | 194340 | }; | |
| 175 | |||
| 176 | 97170 | uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac); | |
| 177 | 97170 | uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac); | |
| 178 | 97170 | uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac); | |
| 179 | 97170 | uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac); | |
| 180 | |||
| 181 | 194340 | uint32x4_t lo = | |
| 182 | 194340 | interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)), | |
| 183 | 97170 | vmovl_u16(vget_low_u16(nyfrac))); | |
| 184 | 194340 | uint32x4_t hi = interpolate_vertical( | |
| 185 | 97170 | line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac)); | |
| 186 | |||
| 187 |   // Discard the upper 16 bits of each element (reducing the precision back to | ||
| 188 |   // the original 16 bits) | ||
| 189 | 194340 | uint16x8_t result = | |
| 190 | 97170 | vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); | |
| 191 | 194340 | return result; | |
| 192 | 97170 | } | |
| 193 | |||
| 194 | template <> | ||
| 195 | class RemapS16Point5Replicate<uint16_t> { | ||
| 196 | public: | ||
| 197 | using ScalarType = uint16_t; | ||
| 198 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 199 | |||
| 200 | 134 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
| 201 | size_t src_height) | ||
| 202 | 134 | : src_rows_{src_rows}, | |
| 203 | 268 | v_src_element_stride_{vdupq_n_u16( | |
| 204 | 134 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
| 205 | 134 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
| 206 | 134 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))}, | |
| 207 | 134 | xfrac_{vdupq_n_u16(0)}, | |
| 208 | 134 | yfrac_{vdupq_n_u16(0)}, | |
| 209 | 134 | nxfrac_{vdupq_n_u16(0)}, | |
| 210 | 134 | nyfrac_{vdupq_n_u16(0)}, | |
| 211 | 134 | x0_{vdupq_n_s16(0)}, | |
| 212 | 134 | x1_{vdupq_n_s16(0)}, | |
| 213 | 134 | y0_{vdupq_n_s16(0)}, | |
| 214 | 134 | y1_{vdupq_n_s16(0)} {} | |
| 215 | |||
| 216 | 158 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 217 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 218 | 5132 | auto vector_path = [&](size_t step) { | |
| 219 | 4974 | prepare_maps(mapxy, mapfrac); | |
| 220 | 4974 | transform_pixels(dst); | |
| 221 | |||
| 222 | 4974 | mapxy += ptrdiff_t(step); | |
| 223 | 4974 | mapfrac += ptrdiff_t(step); | |
| 224 | 4974 | dst += ptrdiff_t(step); | |
| 225 | 4974 | }; | |
| 226 | 158 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 227 | 158 | loop.unroll_once(vector_path); | |
| 228 | 316 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 229 | 158 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 230 | 158 | mapxy -= back_step; | |
| 231 | 158 | mapfrac -= back_step; | |
| 232 | 158 | dst -= back_step; | |
| 233 | 232 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 234 | 158 | } | |
| 235 | |||
| 236 | 4974 | void prepare_maps(Columns<const int16_t> mapxy, | |
| 237 | Columns<const uint16_t> mapfrac) { | ||
| 238 | 4974 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
| 239 | 4974 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
| 240 | 4974 | uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); | |
| 241 | 4974 | uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); | |
| 242 | 9948 | xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), | |
| 243 | 4974 | vandq_u16(frac, frac_mask)); | |
| 244 | 4974 | yfrac_ = vbslq_u16( | |
| 245 | 4974 | vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), | |
| 246 | 4974 | vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask)); | |
| 247 | 4974 | nxfrac_ = vsubq_u16(frac_max, xfrac_); | |
| 248 | 4974 | nyfrac_ = vsubq_u16(frac_max, yfrac_); | |
| 249 | |||
| 250 | // Clamp coordinates to within the dimensions of the source image | ||
| 251 | 4974 | x0_ = vreinterpretq_u16_s16( | |
| 252 | 4974 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); | |
| 253 | 4974 | y0_ = vreinterpretq_u16_s16( | |
| 254 | 4974 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); | |
| 255 | |||
| 256 | // x1 = x0 + 1, except if it's already xmax | ||
| 257 | 4974 | x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_)); | |
| 258 | 4974 | y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_)); | |
| 259 | 4974 | } | |
| 260 | |||
| 261 | 4974 | void transform_pixels(Columns<uint16_t> dst) { | |
| 262 | 4974 | uint16x8_t a = load_pixels(x0_, y0_); | |
| 263 | 4974 | uint16x8_t b = load_pixels(x1_, y0_); | |
| 264 | 4974 | uint16x8_t c = load_pixels(x0_, y1_); | |
| 265 | 4974 | uint16x8_t d = load_pixels(x1_, y1_); | |
| 266 | |||
| 267 | 9948 | uint16x8_t result = | |
| 268 | 4974 | interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_); | |
| 269 | |||
| 270 | 4974 | vst1q_u16(&dst[0], result); | |
| 271 | 4974 | } | |
| 272 | |||
| 273 | 19896 | uint16x8_t load_pixels(int16x8_t x, int16x8_t y) { | |
| 274 | // Clamp coordinates to within the dimensions of the source image | ||
| 275 | 39792 | uint16x8_t x_clamped = | |
| 276 | 19896 | vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_); | |
| 277 | 39792 | uint16x8_t y_clamped = | |
| 278 | 19896 | vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_); | |
| 279 | |||
| 280 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
| 281 | 39792 | uint32x4_t indices_low = | |
| 282 | 39792 | vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped), | |
| 283 | 19896 | vget_low_u16(v_src_element_stride_)); | |
| 284 | 39792 | uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped), | |
| 285 | 19896 | y_clamped, v_src_element_stride_); | |
| 286 | |||
| 287 | // Read pixels from source | ||
| 288 | 179064 | uint16x8_t pixels = { | |
| 289 | 19896 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
| 290 | 19896 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
| 291 | 19896 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
| 292 | 19896 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
| 293 | 19896 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
| 294 | 19896 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
| 295 | 19896 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
| 296 | 19896 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
| 297 | }; | ||
| 298 | |||
| 299 | 39792 | return pixels; | |
| 300 | 19896 | } | |
| 301 | |||
| 302 | private: | ||
| 303 | Rows<const ScalarType> src_rows_; | ||
| 304 | uint16x8_t v_src_element_stride_; | ||
| 305 | int16x8_t v_xmax_; | ||
| 306 | int16x8_t v_ymax_; | ||
| 307 | uint16x8_t xfrac_; | ||
| 308 | uint16x8_t yfrac_; | ||
| 309 | uint16x8_t nxfrac_; | ||
| 310 | uint16x8_t nyfrac_; | ||
| 311 | int16x8_t x0_; | ||
| 312 | int16x8_t x1_; | ||
| 313 | int16x8_t y0_; | ||
| 314 | int16x8_t y1_; | ||
| 315 | }; // end of class RemapS16Point5Replicate<uint16_t> | ||
| 316 | |||
| 317 | template <typename ScalarType> | ||
| 318 | class RemapS16Point5ConstantBorder; | ||
| 319 | |||
| 320 | template <> | ||
| 321 | class RemapS16Point5ConstantBorder<uint8_t> { | ||
| 322 | public: | ||
| 323 | using ScalarType = uint8_t; | ||
| 324 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 325 | |||
| 326 | 132 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
| 327 | size_t src_width, size_t src_height, | ||
| 328 | const ScalarType *border_value) | ||
| 329 | 132 | : src_rows_{src_rows}, | |
| 330 | 132 | v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
| 331 | 132 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
| 332 | 132 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
| 333 | 132 | v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {} | |
| 334 | |||
| 335 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 336 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 337 | 5428 | auto vector_path = [&](size_t step) { | |
| 338 | 5272 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
| 339 | 5272 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
| 340 | 5272 | uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); | |
| 341 | 5272 | uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); | |
| 342 | 5272 | uint16x8_t xfrac = vandq_u16(frac, frac_mask); | |
| 343 | 10544 | uint16x8_t yfrac = | |
| 344 | 5272 | vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); | |
| 345 | 5272 | uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac); | |
| 346 | 5272 | uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac); | |
| 347 | |||
| 348 | 5272 | uint16x8_t one = vdupq_n_u16(1); | |
| 349 | 5272 | uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]); | |
| 350 | 5272 | uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]); | |
| 351 | 5272 | uint16x8_t x1 = vaddq_u16(x0, one); | |
| 352 | 5272 | uint16x8_t y1 = vaddq_u16(y0, one); | |
| 353 | |||
| 354 | 10544 | uint16x8_t a = load_pixels_or_constant_border( | |
| 355 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0); | |
| 356 | 10544 | uint16x8_t b = load_pixels_or_constant_border( | |
| 357 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0); | |
| 358 | 10544 | uint16x8_t c = load_pixels_or_constant_border( | |
| 359 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1); | |
| 360 | 10544 | uint16x8_t d = load_pixels_or_constant_border( | |
| 361 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1); | |
| 362 | |||
| 363 | 5272 | uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac); | |
| 364 | |||
| 365 | 5272 | vst1_u8(&dst[0], vqmovn_u16(result)); | |
| 366 | 5272 | mapxy += ptrdiff_t(step); | |
| 367 | 5272 | mapfrac += ptrdiff_t(step); | |
| 368 | 5272 | dst += ptrdiff_t(step); | |
| 369 | 5272 | }; | |
| 370 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 371 | 156 | loop.unroll_once(vector_path); | |
| 372 | 312 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 373 | 156 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 374 | 156 | mapxy -= back_step; | |
| 375 | 156 | mapfrac -= back_step; | |
| 376 | 156 | dst -= back_step; | |
| 377 | 230 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 378 | 156 | } | |
| 379 | |||
| 380 | private: | ||
| 381 | 21088 | uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_, | |
| 382 | uint16x8_t v_src_element_stride_, | ||
| 383 | uint16x8_t v_width_, | ||
| 384 | uint16x8_t v_height_, | ||
| 385 | uint16x8_t v_border_, uint16x8_t x, | ||
| 386 | uint16x8_t y) { | ||
| 387 | // Find whether coordinates are within the image dimensions. | ||
| 388 | // Negative coordinates are interpreted as large values due to the s16->u16 | ||
| 389 | // reinterpretation. | ||
| 390 | 42176 | uint16x8_t in_range = | |
| 391 | 42176 | vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_), | |
| 392 | 21088 | vcltq_u16(vreinterpretq_u16_s16(y), v_height_)); | |
| 393 | |||
| 394 | // Zero out-of-range coordinates. | ||
| 395 | 21088 | x = vandq_u16(in_range, x); | |
| 396 | 21088 | y = vandq_u16(in_range, y); | |
| 397 | |||
| 398 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
| 399 | 42176 | uint32x4_t indices_low = | |
| 400 | 42176 | vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), | |
| 401 | 21088 | vget_low_u16(v_src_element_stride_)); | |
| 402 | 42176 | uint32x4_t indices_high = | |
| 403 | 21088 | vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); | |
| 404 | |||
| 405 | // Read pixels from source | ||
| 406 | 189792 | uint8x8_t pixels = { | |
| 407 | 21088 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
| 408 | 21088 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
| 409 | 21088 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
| 410 | 21088 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
| 411 | 21088 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
| 412 | 21088 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
| 413 | 21088 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
| 414 | 21088 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
| 415 | }; | ||
| 416 | // Select between source pixels and border colour | ||
| 417 | 42176 | return vbslq_u16(in_range, vmovl_u8(pixels), v_border_); | |
| 418 | 21088 | } | |
| 419 | |||
| 420 | Rows<const ScalarType> src_rows_; | ||
| 421 | uint16x8_t v_src_stride_; | ||
| 422 | uint16x8_t v_width_; | ||
| 423 | uint16x8_t v_height_; | ||
| 424 | uint16x8_t v_border_; | ||
| 425 | }; // end of class RemapS16Point5ConstantBorder<uint8_t> | ||
| 426 | |||
| 427 | template <> | ||
| 428 | class RemapS16Point5ConstantBorder<uint16_t> { | ||
| 429 | public: | ||
| 430 | using ScalarType = uint16_t; | ||
| 431 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 432 | |||
| 433 | 132 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
| 434 | size_t src_width, size_t src_height, | ||
| 435 | const ScalarType *border_value) | ||
| 436 | 132 | : src_rows_{src_rows}, | |
| 437 | 264 | v_src_element_stride_{vdupq_n_u16( | |
| 438 | 132 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
| 439 | 132 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
| 440 | 132 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
| 441 | 132 | v_border_{vdupq_n_u16(*border_value)}, | |
| 442 | 132 | xfrac_{vdupq_n_u16(0)}, | |
| 443 | 132 | yfrac_{vdupq_n_u16(0)}, | |
| 444 | 132 | nxfrac_{vdupq_n_u16(0)}, | |
| 445 | 132 | nyfrac_{vdupq_n_u16(0)}, | |
| 446 | 132 | x0_{vdupq_n_s16(0)}, | |
| 447 | 132 | x1_{vdupq_n_s16(0)}, | |
| 448 | 132 | y0_{vdupq_n_s16(0)}, | |
| 449 | 132 | y1_{vdupq_n_s16(0)} {} | |
| 450 | |||
| 451 | 4972 | void prepare_maps(Columns<const int16_t> mapxy, | |
| 452 | Columns<const uint16_t> mapfrac) { | ||
| 453 | 4972 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
| 454 | 4972 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
| 455 | 4972 | uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); | |
| 456 | 4972 | uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); | |
| 457 | 4972 | xfrac_ = vandq_u16(frac, frac_mask); | |
| 458 | 4972 | yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); | |
| 459 | 4972 | nxfrac_ = vsubq_u16(frac_max, xfrac_); | |
| 460 | 4972 | nyfrac_ = vsubq_u16(frac_max, yfrac_); | |
| 461 | |||
| 462 | 4972 | uint16x8_t one = vdupq_n_u16(1); | |
| 463 | 4972 | x0_ = xy.val[0]; | |
| 464 | 4972 | y0_ = xy.val[1]; | |
| 465 | 4972 | x1_ = vaddq_u16(x0_, one); | |
| 466 | 4972 | y1_ = vaddq_u16(y0_, one); | |
| 467 | 4972 | } | |
| 468 | |||
| 469 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 470 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 471 | 5128 | auto vector_path = [&](size_t step) { | |
| 472 | 4972 | prepare_maps(mapxy, mapfrac); | |
| 473 | 4972 | transform_pixels(dst); | |
| 474 | |||
| 475 | 4972 | mapxy += ptrdiff_t(step); | |
| 476 | 4972 | mapfrac += ptrdiff_t(step); | |
| 477 | 4972 | dst += ptrdiff_t(step); | |
| 478 | 4972 | }; | |
| 479 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 480 | 156 | loop.unroll_once(vector_path); | |
| 481 | 312 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 482 | 156 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 483 | 156 | mapxy -= back_step; | |
| 484 | 156 | mapfrac -= back_step; | |
| 485 | 156 | dst -= back_step; | |
| 486 | 230 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 487 | 156 | } | |
| 488 | |||
| 489 | 4972 | void transform_pixels(Columns<uint16_t> dst) { | |
| 490 | 4972 | uint16x8_t a = load_pixels(x0_, y0_); | |
| 491 | 4972 | uint16x8_t b = load_pixels(x1_, y0_); | |
| 492 | 4972 | uint16x8_t c = load_pixels(x0_, y1_); | |
| 493 | 4972 | uint16x8_t d = load_pixels(x1_, y1_); | |
| 494 | |||
| 495 | 9944 | uint16x8_t result = | |
| 496 | 4972 | interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_); | |
| 497 | |||
| 498 | 4972 | vst1q_u16(&dst[0], result); | |
| 499 | 4972 | } | |
| 500 | |||
| 501 | 19888 | uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) { | |
| 502 | // Find whether coordinates are within the image dimensions. | ||
| 503 | // Negative coordinates are interpreted as large values due to the s16->u16 | ||
| 504 | // reinterpretation. | ||
| 505 | 39776 | uint16x8_t in_range = | |
| 506 | 39776 | vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_), | |
| 507 | 19888 | vcltq_u16(vreinterpretq_u16_s16(y), v_height_)); | |
| 508 | |||
| 509 | // Zero out-of-range coordinates. | ||
| 510 | 19888 | x = vandq_u16(in_range, x); | |
| 511 | 19888 | y = vandq_u16(in_range, y); | |
| 512 | |||
| 513 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
| 514 | 39776 | uint32x4_t indices_low = | |
| 515 | 39776 | vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), | |
| 516 | 19888 | vget_low_u16(v_src_element_stride_)); | |
| 517 | 39776 | uint32x4_t indices_high = | |
| 518 | 19888 | vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); | |
| 519 | |||
| 520 | // Read pixels from source | ||
| 521 | 178992 | uint16x8_t pixels = { | |
| 522 | 19888 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
| 523 | 19888 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
| 524 | 19888 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
| 525 | 19888 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
| 526 | 19888 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
| 527 | 19888 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
| 528 | 19888 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
| 529 | 19888 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
| 530 | }; | ||
| 531 | // Select between source pixels and border colour | ||
| 532 | 39776 | return vbslq_u16(in_range, pixels, v_border_); | |
| 533 | 19888 | } | |
| 534 | |||
| 535 | private: | ||
| 536 | Rows<const ScalarType> src_rows_; | ||
| 537 | uint16x8_t v_src_element_stride_; | ||
| 538 | uint16x8_t v_width_; | ||
| 539 | uint16x8_t v_height_; | ||
| 540 | uint16x8_t v_border_; | ||
| 541 | uint16x8_t xfrac_; | ||
| 542 | uint16x8_t yfrac_; | ||
| 543 | uint16x8_t nxfrac_; | ||
| 544 | uint16x8_t nyfrac_; | ||
| 545 | int16x8_t x0_; | ||
| 546 | int16x8_t x1_; | ||
| 547 | int16x8_t y0_; | ||
| 548 | int16x8_t y1_; | ||
| 549 | }; // end of class RemapS16Point5ConstantBorder<uint16_t> | ||
| 550 | |||
| 551 | 20488 | inline void get_coordinates(Columns<const int16_t> mapxy, | |
| 552 | Columns<const uint16_t> mapfrac, uint16x8_t &x, | ||
| 553 | uint16x8_t &y, uint16x8_t &xfrac, | ||
| 554 | uint16x8_t &yfrac) { | ||
| 555 | 20488 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
| 556 | 20488 | x = xy.val[0]; | |
| 557 | 20488 | y = xy.val[1]; | |
| 558 | |||
| 559 | 20488 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
| 560 | 20488 | xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 561 | 40976 | yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), | |
| 562 | 20488 | vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 563 | 20488 | } | |
| 564 | |||
| 565 | 40976 | inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, | |
| 566 | uint16x4_t y1, uint32x4_t &offsets_a, | ||
| 567 | uint32x4_t &offsets_b, uint32x4_t &offsets_c, | ||
| 568 | uint32x4_t &offsets_d, | ||
| 569 | uint16x4_t v_src_element_stride) { | ||
| 570 | // Multiply by 4 because of channels | ||
| 571 | 40976 | uint32x4_t x0_scaled = vshll_n_u16(x0, 2); | |
| 572 | 40976 | uint32x4_t x1_scaled = vshll_n_u16(x1, 2); | |
| 573 | |||
| 574 | // Calculate offsets from coordinates (y * element_stride + x) | ||
| 575 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
| 576 | 40976 | offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride); | |
| 577 | 40976 | offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride); | |
| 578 | 40976 | offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride); | |
| 579 | 40976 | offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride); | |
| 580 | 40976 | } | |
| 581 | |||
| 582 | inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low, | ||
| 583 | uint8_t frac_high) { | ||
| 584 | uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low, | ||
| 585 | frac_high, frac_high, frac_high, frac_high}; | ||
| 586 | return vmovl_u8(frac_low_high); | ||
| 587 | } | ||
| 588 | |||
| 589 | 337408 | inline uint64_t load_32bit(const uint8_t *src) { | |
| 590 | 337408 | uint32_t value = 0; | |
| 591 | 337408 | memcpy(&value, src, sizeof(uint32_t)); | |
| 592 | 674816 | return static_cast<uint64_t>(value); | |
| 593 | 337408 | } | |
| 594 | |||
| 595 | 84352 | inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows, | |
| 596 | uint32x4_t offsets) { | ||
| 597 | 168704 | uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) | | |
| 598 | 84352 | (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32); | |
| 599 | 168704 | uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) | | |
| 600 | 84352 | (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32); | |
| 601 | 168704 | return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23)); | |
| 602 | 84352 | } | |
| 603 | |||
| 604 | 10544 | inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) { | |
| 605 | using ScalarType = uint8_t; | ||
| 606 | 10544 | neon::VecTraits<ScalarType>::store(res, &dst[0]); | |
| 607 | 10544 | } | |
| 608 | |||
| 609 | 159104 | inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows, | |
| 610 | uint32x2_t offsets) { | ||
| 611 | 318208 | return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]), | |
| 612 | 159104 | vld1_u16(&src_rows[vget_lane_u32(offsets, 1)])); | |
| 613 | } | ||
| 614 | |||
| 615 | 9944 | inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) { | |
| 616 | using ScalarType = uint16_t; | ||
| 617 | 9944 | neon::VecTraits<ScalarType>::store(res, &dst[0]); | |
| 618 | 9944 | } | |
| 619 | |||
| 620 | // Replicate border specific functions | ||
| 621 | 10244 | inline void get_coordinates_replicate(Columns<const int16_t> mapxy, | |
| 622 | Columns<const uint16_t> mapfrac, | ||
| 623 | uint16x8_t &x0, uint16x8_t &y0, | ||
| 624 | uint16x8_t &x1, uint16x8_t &y1, | ||
| 625 | uint16x8_t &xfrac, uint16x8_t &yfrac, | ||
| 626 | int16x8_t v_xmax, int16x8_t v_ymax) { | ||
| 627 | 10244 | get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); | |
| 628 | |||
| 629 | // Zero the xfrac (or yfrac) if x (or y) are below zero | ||
| 630 | 10244 | xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac); | |
| 631 | 10244 | yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac); | |
| 632 | |||
| 633 | // Clamp coordinates to within the dimensions of the source image | ||
| 634 | 10244 | x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax))); | |
| 635 | 10244 | y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax))); | |
| 636 | |||
| 637 | // x1 = x0 + 1, except if it's already xmax | ||
| 638 | 10244 | x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax)); | |
| 639 | 10244 | y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax)); | |
| 640 | 10244 | } | |
| 641 | |||
| 642 | 10544 | inline void load_pixels_u8_4ch_replicate( | |
| 643 | Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
| 644 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b, | ||
| 645 | uint8x16_t &c, uint8x16_t &d) { | ||
| 646 | 10544 | a = load_4px_4ch(src_rows, offsets_a); | |
| 647 | 10544 | b = load_4px_4ch(src_rows, offsets_b); | |
| 648 | 10544 | c = load_4px_4ch(src_rows, offsets_c); | |
| 649 | 10544 | d = load_4px_4ch(src_rows, offsets_d); | |
| 650 | 10544 | } | |
| 651 | |||
| 652 | 9944 | inline void load_pixels_u16_4ch_replicate( | |
| 653 | Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
| 654 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo, | ||
| 655 | uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo, | ||
| 656 | uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) { | ||
| 657 | 9944 | a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); | |
| 658 | 9944 | b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); | |
| 659 | 9944 | c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); | |
| 660 | 9944 | d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); | |
| 661 | |||
| 662 | 9944 | a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); | |
| 663 | 9944 | b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); | |
| 664 | 9944 | c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); | |
| 665 | 9944 | d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); | |
| 666 | 9944 | } | |
| 667 | |||
| 668 | template <typename ScalarType> | ||
| 669 | class RemapS16Point5Replicate4ch; | ||
| 670 | |||
| 671 | template <> | ||
| 672 | class RemapS16Point5Replicate4ch<uint8_t> { | ||
| 673 | public: | ||
| 674 | using ScalarType = uint8_t; | ||
| 675 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 676 | |||
| 677 | 132 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 678 | size_t src_height) | ||
| 679 | 132 | : src_rows_{src_rows}, | |
| 680 | 132 | v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
| 681 | 132 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
| 682 | 132 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
| 683 | |||
| 684 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 685 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 686 | 5428 | auto vector_path = [&](size_t step) { | |
| 687 | 5272 | uint16x8_t x0, y0, x1, y1; | |
| 688 | 5272 | uint16x8_t xfrac, yfrac; | |
| 689 | 10544 | get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, | |
| 690 | 5272 | v_xmax_, v_ymax_); | |
| 691 | |||
| 692 | 5272 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
| 693 | 5272 | uint8x16_t a, b, c, d; | |
| 694 | 5272 | uint8x16x2_t res; | |
| 695 | |||
| 696 | 10544 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
| 697 | 5272 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 698 | 5272 | offsets_d, v_src_stride_); | |
| 699 | 10544 | load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
| 700 | 5272 | offsets_d, a, b, c, d); | |
| 701 | |||
| 702 | // Doubled fractions 001122..., low part | ||
| 703 | 5272 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
| 704 | 5272 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
| 705 | 10544 | uint16x8_t nxfrac2 = | |
| 706 | 5272 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 707 | 10544 | uint16x8_t nyfrac2 = | |
| 708 | 5272 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 709 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 710 | 10544 | uint16x8_t res0 = interpolate( | |
| 711 | 5272 | vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)), | |
| 712 | 5272 | vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
| 713 | 5272 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
| 714 | 10544 | uint16x8_t res1 = interpolate( | |
| 715 | 5272 | vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
| 716 | 5272 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2), | |
| 717 | 5272 | vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2)); | |
| 718 | 5272 | res.val[0] = | |
| 719 | 5272 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
| 720 | |||
| 721 | 10544 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
| 722 | 5272 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 723 | 5272 | offsets_d, v_src_stride_); | |
| 724 | 10544 | load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
| 725 | 5272 | offsets_d, a, b, c, d); | |
| 726 | // Doubled fractions 001122..., high part | ||
| 727 | 5272 | xfrac2 = vzip2q(xfrac, xfrac); | |
| 728 | 5272 | yfrac2 = vzip2q(yfrac, yfrac); | |
| 729 | 5272 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 730 | 5272 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 731 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 732 | 10544 | res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), | |
| 733 | 5272 | vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)), | |
| 734 | 5272 | vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
| 735 | 5272 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
| 736 | 10544 | res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
| 737 | 5272 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), | |
| 738 | 5272 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
| 739 | 5272 | vzip2q(nyfrac2, nyfrac2)); | |
| 740 | 5272 | res.val[1] = | |
| 741 | 5272 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
| 742 | |||
| 743 | 5272 | store_pixels_u8_4ch(res, dst); | |
| 744 | 5272 | mapxy += ptrdiff_t(step); | |
| 745 | 5272 | mapfrac += ptrdiff_t(step); | |
| 746 | 5272 | dst += ptrdiff_t(step); | |
| 747 | 5272 | }; | |
| 748 | |||
| 749 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 750 | 156 | loop.unroll_once(vector_path); | |
| 751 | 312 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 752 | 156 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 753 | 156 | mapxy -= back_step; | |
| 754 | 156 | mapfrac -= back_step; | |
| 755 | 156 | dst -= back_step; | |
| 756 | 230 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 757 | 156 | } | |
| 758 | |||
| 759 | private: | ||
| 760 | Rows<const ScalarType> src_rows_; | ||
| 761 | uint16x4_t v_src_stride_; | ||
| 762 | int16x8_t v_xmax_; | ||
| 763 | int16x8_t v_ymax_; | ||
| 764 | }; // end of class RemapS16Point5Replicate4ch<uint8_t> | ||
| 765 | |||
| 766 | template <> | ||
| 767 | class RemapS16Point5Replicate4ch<uint16_t> { | ||
| 768 | public: | ||
| 769 | using ScalarType = uint16_t; | ||
| 770 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 771 | |||
| 772 | 132 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 773 | size_t src_height) | ||
| 774 | 132 | : src_rows_{src_rows}, | |
| 775 | 264 | v_src_element_stride_{vdup_n_u16( | |
| 776 | 132 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
| 777 | 132 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
| 778 | 132 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
| 779 | |||
| 780 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 781 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 782 | 5128 | auto vector_path = [&](size_t step) { | |
| 783 | 4972 | uint16x8_t x0, y0, x1, y1; | |
| 784 | 4972 | uint16x8_t xfrac, yfrac; | |
| 785 | 9944 | get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, | |
| 786 | 4972 | v_xmax_, v_ymax_); | |
| 787 | |||
| 788 | 4972 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
| 789 | 4972 | uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; | |
| 790 | 4972 | uint16x8x4_t res; | |
| 791 | 9944 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
| 792 | 4972 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 793 | 4972 | offsets_d, v_src_element_stride_); | |
| 794 | 9944 | load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
| 795 | 4972 | offsets_d, a_low, a_high, b_low, b_high, | |
| 796 | c_low, c_high, d_low, d_high); | ||
| 797 | |||
| 798 | // Doubled fractions 001122..., low part | ||
| 799 | 4972 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
| 800 | 4972 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
| 801 | 9944 | uint16x8_t nxfrac2 = | |
| 802 | 4972 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 803 | 9944 | uint16x8_t nyfrac2 = | |
| 804 | 4972 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 805 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 806 | 4972 | res.val[0] = | |
| 807 | 9944 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
| 808 | 4972 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
| 809 | 4972 | vzip1q(nyfrac2, nyfrac2)); | |
| 810 | 4972 | res.val[1] = | |
| 811 | 9944 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
| 812 | 4972 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
| 813 | 4972 | vzip2q(nyfrac2, nyfrac2)); | |
| 814 | |||
| 815 | 9944 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
| 816 | 4972 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 817 | 4972 | offsets_d, v_src_element_stride_); | |
| 818 | 9944 | load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
| 819 | 4972 | offsets_d, a_low, a_high, b_low, b_high, | |
| 820 | c_low, c_high, d_low, d_high); | ||
| 821 | // Doubled fractions 001122..., high part | ||
| 822 | 4972 | xfrac2 = vzip2q(xfrac, xfrac); | |
| 823 | 4972 | yfrac2 = vzip2q(yfrac, yfrac); | |
| 824 | 4972 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 825 | 4972 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 826 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 827 | 4972 | res.val[2] = | |
| 828 | 9944 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
| 829 | 4972 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
| 830 | 4972 | vzip1q(nyfrac2, nyfrac2)); | |
| 831 | 4972 | res.val[3] = | |
| 832 | 9944 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
| 833 | 4972 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
| 834 | 4972 | vzip2q(nyfrac2, nyfrac2)); | |
| 835 | |||
| 836 | 4972 | store_pixels_u16_4ch(res, dst); | |
| 837 | 4972 | mapxy += ptrdiff_t(step); | |
| 838 | 4972 | mapfrac += ptrdiff_t(step); | |
| 839 | 4972 | dst += ptrdiff_t(step); | |
| 840 | 4972 | }; | |
| 841 | |||
| 842 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 843 | 156 | loop.unroll_once(vector_path); | |
| 844 | 312 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 845 | 156 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 846 | 156 | mapxy -= back_step; | |
| 847 | 156 | mapfrac -= back_step; | |
| 848 | 156 | dst -= back_step; | |
| 849 | 230 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 850 | 156 | } | |
| 851 | |||
| 852 | private: | ||
| 853 | Rows<const ScalarType> src_rows_; | ||
| 854 | uint16x4_t v_src_element_stride_; | ||
| 855 | int16x8_t v_xmax_; | ||
| 856 | int16x8_t v_ymax_; | ||
| 857 | }; // end of class RemapS16Point5Replicate4ch<uint16_t> | ||
| 858 | |||
| 859 | // Constant border specific functions | ||
| 860 | 10244 | inline void get_coordinates_constant( | |
| 861 | Columns<const int16_t> mapxy, Columns<const uint16_t> mapfrac, | ||
| 862 | uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0, | ||
| 863 | uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac, | ||
| 864 | uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c, | ||
| 865 | uint16x8_t &in_range_d) { | ||
| 866 | 10244 | get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); | |
| 867 | |||
| 868 | 10244 | uint16x8_t one = vdupq_n_u16(1); | |
| 869 | 10244 | x1 = vaddq_u16(x0, one); | |
| 870 | 10244 | y1 = vaddq_u16(y0, one); | |
| 871 | |||
| 872 | 10244 | uint16x8_t x0_in_range = vcltq_u16(x0, v_width); | |
| 873 | 10244 | uint16x8_t y0_in_range = vcltq_u16(y0, v_height); | |
| 874 | 10244 | uint16x8_t x1_in_range = vcltq_u16(x1, v_width); | |
| 875 | 10244 | uint16x8_t y1_in_range = vcltq_u16(y1, v_height); | |
| 876 | |||
| 877 | 10244 | in_range_a = vandq(x0_in_range, y0_in_range); | |
| 878 | 10244 | in_range_b = vandq(x1_in_range, y0_in_range); | |
| 879 | 10244 | in_range_c = vandq(x0_in_range, y1_in_range); | |
| 880 | 10244 | in_range_d = vandq(x1_in_range, y1_in_range); | |
| 881 | 10244 | } | |
| 882 | |||
| 883 | 81952 | inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range, | |
| 884 | uint32x4_t offsets) { | ||
| 885 | 81952 | return vbslq_u32(in_range, offsets, vdupq_n_u32(0)); | |
| 886 | } | ||
| 887 | |||
| 888 | 42176 | inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range, | |
| 889 | uint8x16_t pixels, | ||
| 890 | uint8x16_t v_border) { | ||
| 891 | 42176 | return vreinterpretq_u8_u32( | |
| 892 | 42176 | vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), v_border)); | |
| 893 | } | ||
| 894 | |||
| 895 | 79552 | inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range, | |
| 896 | uint16x8_t pixels, | ||
| 897 | uint16x8_t v_border) { | ||
| 898 | 79552 | return vreinterpretq_u16_u64( | |
| 899 | 79552 | vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), v_border)); | |
| 900 | } | ||
| 901 | |||
| 902 | 10544 | inline void load_pixels_u8_4ch_constant( | |
| 903 | Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
| 904 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a, | ||
| 905 | uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d, | ||
| 906 | uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c, | ||
| 907 | uint8x16_t &d) { | ||
| 908 | 10544 | offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); | |
| 909 | 10544 | offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); | |
| 910 | 10544 | offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); | |
| 911 | 10544 | offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); | |
| 912 | |||
| 913 | 10544 | a = load_4px_4ch(src_rows, offsets_a); | |
| 914 | 10544 | b = load_4px_4ch(src_rows, offsets_b); | |
| 915 | 10544 | c = load_4px_4ch(src_rows, offsets_c); | |
| 916 | 10544 | d = load_4px_4ch(src_rows, offsets_d); | |
| 917 | |||
| 918 | 10544 | a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border); | |
| 919 | 10544 | b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border); | |
| 920 | 10544 | c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border); | |
| 921 | 10544 | d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border); | |
| 922 | 10544 | } | |
| 923 | |||
| 924 | 9944 | inline void load_pixels_u16_4ch_constant( | |
| 925 | Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
| 926 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a, | ||
| 927 | uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d, | ||
| 928 | uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo, | ||
| 929 | uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo, | ||
| 930 | uint16x8_t &d_hi) { | ||
| 931 | 9944 | offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); | |
| 932 | 9944 | offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); | |
| 933 | 9944 | offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); | |
| 934 | 9944 | offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); | |
| 935 | |||
| 936 | 9944 | a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); | |
| 937 | 9944 | b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); | |
| 938 | 9944 | c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); | |
| 939 | 9944 | d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); | |
| 940 | |||
| 941 | // Convert bitsets such as in_range to 64bits, making all 1s or all 0s | ||
| 942 | 49720 | auto low32_to_u64 = [](uint32x4_t bitset) { | |
| 943 | 39776 | return vreinterpretq_u64_s64( | |
| 944 | 39776 | vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset)))); | |
| 945 | }; | ||
| 946 | |||
| 947 | 19888 | a_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo, | |
| 948 | 9944 | v_border); | |
| 949 | 19888 | b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo, | |
| 950 | 9944 | v_border); | |
| 951 | 19888 | c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo, | |
| 952 | 9944 | v_border); | |
| 953 | 19888 | d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo, | |
| 954 | 9944 | v_border); | |
| 955 | |||
| 956 | 9944 | a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); | |
| 957 | 9944 | b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); | |
| 958 | 9944 | c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); | |
| 959 | 9944 | d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); | |
| 960 | |||
| 961 | // Convert bitsets such as in_range to 64bits, making all 1s or all 0s | ||
| 962 | 49720 | auto hi32_to_u64 = [](uint32x4_t bitset) { | |
| 963 | 39776 | return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset))); | |
| 964 | }; | ||
| 965 | |||
| 966 | 19888 | a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi, | |
| 967 | 9944 | v_border); | |
| 968 | 19888 | b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi, | |
| 969 | 9944 | v_border); | |
| 970 | 19888 | c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi, | |
| 971 | 9944 | v_border); | |
| 972 | 19888 | d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi, | |
| 973 | 9944 | v_border); | |
| 974 | 9944 | } | |
| 975 | |||
| 976 | // Convert bitsets such as in_range to 32bits, making all 1s or all 0s | ||
| 977 | 40976 | static uint32x4_t low16_to_s32(uint16x8_t bitset) { | |
| 978 | 40976 | return vreinterpretq_u32_s32( | |
| 979 | 40976 | vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset)))); | |
| 980 | } | ||
| 981 | |||
| 982 | 40976 | static uint32x4_t hi16_to_s32(uint16x8_t bitset) { | |
| 983 | 40976 | return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset))); | |
| 984 | } | ||
| 985 | |||
| 986 | template <typename ScalarType> | ||
| 987 | class RemapS16Point5Constant4ch; | ||
| 988 | |||
| 989 | template <> | ||
| 990 | class RemapS16Point5Constant4ch<uint8_t> { | ||
| 991 | public: | ||
| 992 | using ScalarType = uint8_t; | ||
| 993 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 994 | |||
| 995 | 132 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 996 | size_t src_height, const ScalarType *border_value) | ||
| 997 | 132 | : src_rows_{src_rows}, | |
| 998 | 132 | v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
| 999 | 132 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
| 1000 | 132 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
| 1001 | 132 | v_border_{} { | |
| 1002 | 132 | uint32_t border_value_32{}; | |
| 1003 | 132 | memcpy(&border_value_32, border_value, sizeof(uint32_t)); | |
| 1004 | 132 | v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32)); | |
| 1005 | 132 | } | |
| 1006 | |||
| 1007 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 1008 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 1009 | 5428 | auto vector_path = [&](size_t step) { | |
| 1010 | 5272 | uint16x8_t x0, y0, x1, y1; | |
| 1011 | 5272 | uint16x8_t xfrac, yfrac; | |
| 1012 | 5272 | uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; | |
| 1013 | 5272 | get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, | |
| 1014 | y1, xfrac, yfrac, in_range_a, in_range_b, | ||
| 1015 | in_range_c, in_range_d); | ||
| 1016 | |||
| 1017 | 5272 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
| 1018 | 5272 | uint8x16_t a, b, c, d; | |
| 1019 | 5272 | uint8x16x2_t res; | |
| 1020 | |||
| 1021 | 10544 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
| 1022 | 5272 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 1023 | 5272 | offsets_d, v_src_stride_); | |
| 1024 | |||
| 1025 | 5272 | load_pixels_u8_4ch_constant( | |
| 1026 | 5272 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
| 1027 | 5272 | low16_to_s32(in_range_a), low16_to_s32(in_range_b), | |
| 1028 | 5272 | low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b, | |
| 1029 | c, d); | ||
| 1030 | |||
| 1031 | // Doubled fractions 001122..., low part | ||
| 1032 | 5272 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
| 1033 | 5272 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
| 1034 | 10544 | uint16x8_t nxfrac2 = | |
| 1035 | 5272 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 1036 | 10544 | uint16x8_t nyfrac2 = | |
| 1037 | 5272 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 1038 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 1039 | 10544 | uint16x8_t res0 = interpolate( | |
| 1040 | 5272 | vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)), | |
| 1041 | 5272 | vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
| 1042 | 5272 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
| 1043 | 10544 | uint16x8_t res1 = interpolate( | |
| 1044 | 5272 | vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
| 1045 | 5272 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2), | |
| 1046 | 5272 | vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2)); | |
| 1047 | 5272 | res.val[0] = | |
| 1048 | 5272 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
| 1049 | |||
| 1050 | 10544 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
| 1051 | 5272 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 1052 | 5272 | offsets_d, v_src_stride_); | |
| 1053 | |||
| 1054 | 5272 | load_pixels_u8_4ch_constant( | |
| 1055 | 5272 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
| 1056 | 5272 | hi16_to_s32(in_range_a), hi16_to_s32(in_range_b), | |
| 1057 | 5272 | hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c, | |
| 1058 | d); | ||
| 1059 | // Doubled fractions 001122..., high part | ||
| 1060 | 5272 | xfrac2 = vzip2q(xfrac, xfrac); | |
| 1061 | 5272 | yfrac2 = vzip2q(yfrac, yfrac); | |
| 1062 | 5272 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 1063 | 5272 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 1064 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 1065 | 10544 | res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), | |
| 1066 | 5272 | vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)), | |
| 1067 | 5272 | vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
| 1068 | 5272 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
| 1069 | 10544 | res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
| 1070 | 5272 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), | |
| 1071 | 5272 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
| 1072 | 5272 | vzip2q(nyfrac2, nyfrac2)); | |
| 1073 | 5272 | res.val[1] = | |
| 1074 | 5272 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
| 1075 | |||
| 1076 | 5272 | store_pixels_u8_4ch(res, dst); | |
| 1077 | 5272 | mapxy += ptrdiff_t(step); | |
| 1078 | 5272 | mapfrac += ptrdiff_t(step); | |
| 1079 | 5272 | dst += ptrdiff_t(step); | |
| 1080 | 5272 | }; | |
| 1081 | |||
| 1082 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 1083 | 156 | loop.unroll_once(vector_path); | |
| 1084 | 312 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 1085 | 156 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 1086 | 156 | mapxy -= back_step; | |
| 1087 | 156 | mapfrac -= back_step; | |
| 1088 | 156 | dst -= back_step; | |
| 1089 | 230 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 1090 | 156 | } | |
| 1091 | |||
| 1092 | private: | ||
| 1093 | Rows<const ScalarType> src_rows_; | ||
| 1094 | uint16x4_t v_src_stride_; | ||
| 1095 | uint16x8_t v_width_; | ||
| 1096 | uint16x8_t v_height_; | ||
| 1097 | uint8x16_t v_border_; | ||
| 1098 | }; // end of class RemapS16Point5Constant4ch<uint8_t> | ||
| 1099 | |||
| 1100 | template <> | ||
| 1101 | class RemapS16Point5Constant4ch<uint16_t> { | ||
| 1102 | public: | ||
| 1103 | using ScalarType = uint16_t; | ||
| 1104 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
| 1105 | |||
| 1106 | 132 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 1107 | size_t src_height, const ScalarType *border_value) | ||
| 1108 | 132 | : src_rows_{src_rows}, | |
| 1109 | 264 | v_src_element_stride_{vdup_n_u16( | |
| 1110 | 132 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
| 1111 | 132 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
| 1112 | 132 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
| 1113 | 132 | v_border_{} { | |
| 1114 | 132 | uint64_t border_value_64{}; | |
| 1115 | 132 | memcpy(&border_value_64, border_value, sizeof(uint64_t)); | |
| 1116 | 132 | v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64)); | |
| 1117 | 132 | } | |
| 1118 | |||
| 1119 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 1120 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 1121 | 5128 | auto vector_path = [&](size_t step) { | |
| 1122 | 4972 | uint16x8_t x0, y0, x1, y1; | |
| 1123 | 4972 | uint16x8_t xfrac, yfrac; | |
| 1124 | 4972 | uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; | |
| 1125 | 4972 | get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, | |
| 1126 | y1, xfrac, yfrac, in_range_a, in_range_b, | ||
| 1127 | in_range_c, in_range_d); | ||
| 1128 | |||
| 1129 | 4972 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
| 1130 | 4972 | uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; | |
| 1131 | 4972 | uint16x8x4_t res; | |
| 1132 | |||
| 1133 | 9944 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
| 1134 | 4972 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 1135 | 4972 | offsets_d, v_src_element_stride_); | |
| 1136 | |||
| 1137 | 4972 | load_pixels_u16_4ch_constant( | |
| 1138 | 4972 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
| 1139 | 4972 | low16_to_s32(in_range_a), low16_to_s32(in_range_b), | |
| 1140 | 4972 | low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low, | |
| 1141 | a_high, b_low, b_high, c_low, c_high, d_low, d_high); | ||
| 1142 | |||
| 1143 | // Doubled fractions 001122..., low part | ||
| 1144 | 4972 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
| 1145 | 4972 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
| 1146 | 9944 | uint16x8_t nxfrac2 = | |
| 1147 | 4972 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 1148 | 9944 | uint16x8_t nyfrac2 = | |
| 1149 | 4972 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 1150 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 1151 | 4972 | res.val[0] = | |
| 1152 | 9944 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
| 1153 | 4972 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
| 1154 | 4972 | vzip1q(nyfrac2, nyfrac2)); | |
| 1155 | 4972 | res.val[1] = | |
| 1156 | 9944 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
| 1157 | 4972 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
| 1158 | 4972 | vzip2q(nyfrac2, nyfrac2)); | |
| 1159 | |||
| 1160 | 9944 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
| 1161 | 4972 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
| 1162 | 4972 | offsets_d, v_src_element_stride_); | |
| 1163 | |||
| 1164 | 4972 | load_pixels_u16_4ch_constant( | |
| 1165 | 4972 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
| 1166 | 4972 | hi16_to_s32(in_range_a), hi16_to_s32(in_range_b), | |
| 1167 | 4972 | hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low, | |
| 1168 | a_high, b_low, b_high, c_low, c_high, d_low, d_high); | ||
| 1169 | |||
| 1170 | // Doubled fractions 001122..., high part | ||
| 1171 | 4972 | xfrac2 = vzip2q(xfrac, xfrac); | |
| 1172 | 4972 | yfrac2 = vzip2q(yfrac, yfrac); | |
| 1173 | 4972 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
| 1174 | 4972 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
| 1175 | // Quadrupled fractions (00001111) are passed to interpolate | ||
| 1176 | 4972 | res.val[2] = | |
| 1177 | 9944 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
| 1178 | 4972 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
| 1179 | 4972 | vzip1q(nyfrac2, nyfrac2)); | |
| 1180 | 4972 | res.val[3] = | |
| 1181 | 9944 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
| 1182 | 4972 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
| 1183 | 4972 | vzip2q(nyfrac2, nyfrac2)); | |
| 1184 | |||
| 1185 | 4972 | store_pixels_u16_4ch(res, dst); | |
| 1186 | 4972 | mapxy += ptrdiff_t(step); | |
| 1187 | 4972 | mapfrac += ptrdiff_t(step); | |
| 1188 | 4972 | dst += ptrdiff_t(step); | |
| 1189 | 4972 | }; | |
| 1190 | |||
| 1191 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 1192 | 156 | loop.unroll_once(vector_path); | |
| 1193 | 312 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
| 1194 | 156 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
| 1195 | 156 | mapxy -= back_step; | |
| 1196 | 156 | mapfrac -= back_step; | |
| 1197 | 156 | dst -= back_step; | |
| 1198 | 230 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
| 1199 | 156 | } | |
| 1200 | |||
| 1201 | private: | ||
| 1202 | Rows<const ScalarType> src_rows_; | ||
| 1203 | uint16x4_t v_src_element_stride_; | ||
| 1204 | uint16x8_t v_width_; | ||
| 1205 | uint16x8_t v_height_; | ||
| 1206 | uint16x8_t v_border_; | ||
| 1207 | }; // end of class RemapS16Point5Constant4ch<uint16_t> | ||
| 1208 | |||
| 1209 | // Most of the complexity comes from parameter checking. | ||
| 1210 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
// Remaps `src` into `dst` using a fixed-point coordinate map (`mapxy`) and a
// per-pixel fraction map (`mapfrac`).
//
// The function validates its arguments, wraps the raw pointers in row views
// and then runs one of four row operations over a dst_width x dst_height
// rectangle via zip_rows(), selected by `border_type` and `channels`:
//   - constant border,  1 channel : RemapS16Point5ConstantBorder<T>
//   - constant border,  4 channels: RemapS16Point5Constant4ch<T>
//   - replicate border, 1 channel : RemapS16Point5Replicate<T>
//   - replicate border, 4 channels: RemapS16Point5Replicate4ch<T>
//
// Parameters:
//   src, src_stride, src_width, src_height - source image.
//   dst, dst_stride, dst_width, dst_height - destination image.
//   channels      - channel count of src and dst; the dispatch below handles
//                   1 and asserts 4 otherwise.
//   mapxy, mapxy_stride     - two int16_t values per destination pixel (the
//                   row operations load them de-interleaved as x and y).
//   mapfrac, mapfrac_stride - one uint16_t per destination pixel holding the
//                   fractional parts.
//   border_type   - KLEIDICV_BORDER_TYPE_CONSTANT or
//                   KLEIDICV_BORDER_TYPE_REPLICATE (anything that is not
//                   CONSTANT is asserted to be REPLICATE).
//   border_value  - must be non-null when border_type is CONSTANT; it is not
//                   read on the replicate path.
//
// Returns:
//   KLEIDICV_ERROR_NULL_POINTER    - constant border with a null border_value.
//   KLEIDICV_ERROR_NOT_IMPLEMENTED - remap_s16point5_is_implemented<T>()
//                                    rejected the configuration.
//   KLEIDICV_OK                    - the remap was performed.
//   The CHECK_* macros are defined outside this chunk; presumably they return
//   their own error codes from this function on failure - TODO confirm.
//
// NOTE(review): border_type and border_value are both used unconditionally
// below, so the [[maybe_unused]] attributes look redundant - confirm before
// removing.
| 1211 | template <typename T> | ||
| 1212 | 1140 | kleidicv_error_t remap_s16point5( | |
| 1213 | const T *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 1214 | T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 1215 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
| 1216 | const uint16_t *mapfrac, size_t mapfrac_stride, | ||
| 1217 | [[maybe_unused]] kleidicv_border_type_t border_type, | ||
| 1218 | [[maybe_unused]] const T *border_value) { | ||
// Argument validation. Both maps are checked against dst_height because they
// are walked row by row together with dst (see the zip_rows calls below).
| 1219 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 568 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 568 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 568 times.
|
1140 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 1220 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 566 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 566 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 566 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 566 times.
|
1136 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
| 1221 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 564 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 564 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 564 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 564 times.
|
1132 | CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); |
| 1222 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 562 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 562 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 562 times.
|
1128 | CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height); |
| 1223 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 556 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 556 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 560 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 556 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 556 times.
|
1124 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 1224 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 554 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 552 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 552 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 554 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 552 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 552 times.
|
1112 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
// A constant border needs a value: border_value is handed to the
// constant-border operations below, so reject a null pointer up front.
| 1225 |
8/8✓ Branch 0 taken 268 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 268 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 266 times.
✓ Branch 7 taken 2 times.
|
1104 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { |
| 1226 | 4 | return KLEIDICV_ERROR_NULL_POINTER; | |
| 1227 | } | ||
| 1228 | |||
// Bail out for configurations this backend does not handle. The predicate is
// defined outside this chunk; presumably it is what restricts border_type and
// channels to the values the asserts below expect - TODO confirm.
| 1229 |
8/8✓ Branch 0 taken 530 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 530 times.
✓ Branch 3 taken 20 times.
✓ Branch 4 taken 530 times.
✓ Branch 5 taken 20 times.
✓ Branch 6 taken 530 times.
✓ Branch 7 taken 20 times.
|
2200 | if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height, |
| 1230 | 1100 | dst_width, border_type, channels)) { | |
| 1231 | 40 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1232 | } | ||
| 1233 | |||
// Row views over the raw buffers. The third argument is `channels` for
// src/dst, 2 for mapxy and 1 for mapfrac.
| 1234 | 1060 | Rows<const T> src_rows{src, src_stride, channels}; | |
| 1235 | 1060 | Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2}; | |
| 1236 | 1060 | Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1}; | |
| 1237 | 1060 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
| 1238 | 1060 | Rectangle rect{dst_width, dst_height}; | |
// Dispatch on border handling first, then on channel count.
| 1239 |
4/4✓ Branch 0 taken 266 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 264 times.
|
1060 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { |
| 1240 |
4/4✓ Branch 0 taken 132 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 132 times.
|
528 | if (channels == 1) { |
| 1241 | 528 | RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height, | |
| 1242 | 264 | border_value}; | |
| 1243 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1244 | 264 | } else { | |
| 1245 | assert(channels == 4); | ||
| 1246 | 528 | RemapS16Point5Constant4ch<T> operation{src_rows, src_width, src_height, | |
| 1247 | 264 | border_value}; | |
| 1248 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1249 | 264 | } | |
| 1250 | 528 | } else { | |
| 1251 | assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); | ||
| 1252 |
4/4✓ Branch 0 taken 134 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 134 times.
✓ Branch 3 taken 132 times.
|
532 | if (channels == 1) { |
| 1253 | 268 | RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height}; | |
| 1254 | 268 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1255 | 268 | } else { | |
| 1256 | assert(channels == 4); | ||
| 1257 | 264 | RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height}; | |
| 1258 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1259 | 264 | } | |
| 1260 | } | ||
| 1261 | 1060 | return KLEIDICV_OK; | |
| 1262 | 1140 | } | |
| 1263 | // NOLINTEND(readability-function-cognitive-complexity) | ||
| 1264 | |||
// Expands to an explicit instantiation of remap_s16point5<type>, decorated
// with KLEIDICV_TARGET_FN_ATTRS (defined outside this chunk). The terminating
// semicolon is supplied at the point of use. The template is instantiated
// below for uint8_t and uint16_t.
| 1265 | #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ | ||
| 1266 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \ | ||
| 1267 | const type *src, size_t src_stride, size_t src_width, size_t src_height, \ | ||
| 1268 | type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ | ||
| 1269 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ | ||
| 1270 | const uint16_t *mapfrac, size_t mapfrac_stride, \ | ||
| 1271 | kleidicv_border_type_t border_type, const type *border_value) | ||
| 1272 | |||
| 1273 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); | ||
| 1274 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); | ||
| 1275 | |||
| 1276 | } // namespace kleidicv::neon | ||
| 1277 |