KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/remap_s16point5_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 790 790 100.0%
Functions: 67 67 100.0%
Branches: 84 84 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/neon.h"
8 #include "kleidicv/transform/remap.h"
9
10 namespace kleidicv::neon {
11
12 template <typename ScalarType>
13 class RemapS16Point5Replicate;
14
// Specialization for single-channel u8 images with replicated borders.
// Each vector step produces 8 destination pixels by blending the four source
// pixels around every mapped coordinate, with coordinates clamped to the image.
15 template <>
16 class RemapS16Point5Replicate<uint8_t> {
17 public:
18 using ScalarType = uint8_t;
19 using MapVecTraits = neon::VecTraits<int16_t>;
20 using MapVectorType = typename MapVecTraits::VectorType;
21 using MapVector2Type = typename MapVecTraits::Vector2Type;
22 using FracVecTraits = neon::VecTraits<uint16_t>;
23 using FracVectorType = typename FracVecTraits::VectorType;
24
// Broadcasts the source row stride and the largest valid coordinates
// (src_width - 1, src_height - 1) into vectors.
// NOTE(review): the static_casts narrow to 16 bits without any check here, so
// stride and dimensions are presumably validated by the caller - confirm.
25 67 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
26 size_t src_height)
27 67 : src_rows_{src_rows},
28 67 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
29 67 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
30 67 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
31
// Writes `width` interpolated pixels of one destination row to dst.
// mapxy supplies interleaved (x, y) int16 coordinates (split by vld2q_s16) and
// mapfrac supplies one uint16 per pixel holding the packed x and y fractions.
32 79 void process_row(size_t width, Columns<const int16_t> mapxy,
33 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
// Processes one vector of 8 pixels, then advances all three cursors by `step`.
34 2536 auto vector_path = [&](size_t step) {
35 2457 MapVector2Type xy = vld2q_s16(&mapxy[0]);
36 2457 FracVectorType frac = vld1q_u16(&mapfrac[0]);
// Fractions are forced to 0 in lanes whose coordinate is negative, so those
// lanes take their value entirely from the pixel at the clamped coordinate.
37 4914 uint16x8_t xfrac =
38 4914 vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
39 // extract xfrac = frac[0:4]
40 2457 vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
41 4914 uint16x8_t yfrac =
42 4914 vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
43 // extract yfrac = frac[5:9]
44 4914 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
45 2457 vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
// Complementary weights: frac + nfrac == REMAP16POINT5_FRAC_MAX.
46 2457 uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
47 2457 uint16x8_t nyfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
48
49 // Clamp coordinates to within the dimensions of the source image
50 4914 uint16x8_t x0 = vreinterpretq_u16_s16(
51 2457 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
52 4914 uint16x8_t y0 = vreinterpretq_u16_s16(
53 2457 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
54
55 // x1 = x0 + 1, except if it's already xmax
// vcltq_s16 yields all-ones (0xFFFF) in true lanes, so subtracting the mask
// adds 1. NOTE(review): the comparison uses the unclamped coordinate, so a
// negative x gives x1 = 1. That sample has weight 0 (xfrac was zeroed), but
// if xmax == 0 the index lies past the last column - confirm such sources
// are rejected before reaching this code. The same applies to y1 / ymax.
56 2457 uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_));
57 2457 uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_));
58
// Interpolate the low four lanes, then the high four lanes. The x
// coordinates are widened to 32 bits because they seed the offset accumulator.
59 4914 uint16x4_t dst_low = load_and_interpolate(
60 2457 vmovl_u16(vget_low_u16(x0)), vget_low_u16(y0),
61 2457 vmovl_u16(vget_low_u16(x1)), vget_low_u16(y1), vget_low_u16(xfrac),
62 2457 vget_low_u16(yfrac), vget_low_u16(nxfrac), vget_low_u16(nyfrac));
63
64 4914 uint16x4_t dst_high = load_and_interpolate(
65 2457 vmovl_high_u16(x0), vget_high_u16(y0), vmovl_high_u16(x1),
66 2457 vget_high_u16(y1), vget_high_u16(xfrac), vget_high_u16(yfrac),
67 2457 vget_high_u16(nxfrac), vget_high_u16(nyfrac));
68
// vuzp1_u8 keeps the even-indexed bytes, i.e. the low byte of each 16-bit
// result on little-endian targets, giving 8 u8 pixels.
// NOTE(review): uint16x4_t values are passed where vuzp1_u8 takes uint8x8_t;
// this relies on the compiler permitting implicit vector conversions.
69 2457 vst1_u8(&dst[0], vuzp1_u8(dst_low, dst_high));
70 2457 mapxy += ptrdiff_t(step);
71 2457 mapfrac += ptrdiff_t(step);
72 2457 dst += ptrdiff_t(step);
73 2457 };
// NOTE(review): LoopUnroll is defined elsewhere; presumably unroll_once runs
// vector_path once per full vector. The cursors are then moved back by
// (step - remaining_length) so that one more full-width vector ends exactly at
// the end of the row, overlapping elements that were already written.
74 79 LoopUnroll loop{width, MapVecTraits::num_lanes()};
75 79 loop.unroll_once(vector_path);
76 158 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
77 79 static_cast<ptrdiff_t>(loop.remaining_length());
78 79 mapxy -= back_step;
79 79 mapfrac -= back_step;
80 79 dst -= back_step;
81 116 loop.remaining([&](size_t, size_t step) { vector_path(step); });
82 79 }
83
84 private:
// Gathers the four neighbours (a, b, c, d) of four pixels and blends them.
// x0/x1 are 32-bit because they are used as the accumulator of vmlal_u16;
// y0/y1 and all fraction vectors are 16-bit.
// Returns (line0 * nyfrac + line1 * yfrac + bias) >> (2 * FRAC_BITS) per lane,
// where line0 = a * nxfrac + b * xfrac and line1 = c * nxfrac + d * xfrac.
85 4914 uint16x4_t load_and_interpolate(uint32x4_t x0, uint16x4_t y0, uint32x4_t x1,
86 uint16x4_t y1, uint16x4_t xfrac,
87 uint16x4_t yfrac, uint16x4_t nxfrac,
88 uint16x4_t nyfrac) {
89 // Calculate offsets from coordinates (y * stride + x)
90 // a: top left, b: top right, c: bottom left, d: bottom right
91 4914 uint32x4_t offset = vmlal_u16(x0, y0, v_src_stride_);
// Gather four pixels with scalar loads, placing each in its own 16-bit field
// of a 64-bit value that is then viewed as a uint16x4_t.
92 9828 uint64_t acc =
93 9828 static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
94 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
95 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
96 4914 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
97 4914 uint16x4_t a = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
98
99 4914 offset = vmlal_u16(x1, y0, v_src_stride_);
100
101 14742 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
102 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
103 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
104 4914 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
105 4914 uint16x4_t b = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
106
// Horizontal blend of the top row, kept in 16-bit lanes.
// NOTE(review): this fits in 16 bits assuming REMAP16POINT5_FRAC_MAX is 32
// (5 fractional bits, as the frac[0:4] comment suggests) - confirm.
107 4914 uint16x4_t line0 = vmla_u16(vmul_u16(xfrac, b), nxfrac, a);
108
109 4914 offset = vmlal_u16(x0, y1, v_src_stride_);
110
111 14742 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
112 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
113 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
114 4914 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
115 4914 uint16x4_t c = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
116
// Start the vertical blend in 32 bits: rounding bias + top row * nyfrac.
117 9828 uint32x4_t line0_lerpd = vmlal_u16(
118 4914 vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, nyfrac);
119
120 4914 offset = vmlal_u16(x1, y1, v_src_stride_);
121
122 14742 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
123 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
124 9828 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
125 4914 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
126 4914 uint16x4_t d = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
127
// Horizontal blend of the bottom row, then add bottom row * yfrac, shift out
// the 2 * FRAC_BITS fractional bits and narrow each lane to 16 bits.
128 4914 uint16x4_t line1 = vmla_u16(vmul_u16(xfrac, d), nxfrac, c);
129 9828 return vshrn_n_u32(vmlal_u16(line0_lerpd, line1, yfrac),
130 2 * REMAP16POINT5_FRAC_BITS);
131 4914 }
132
// Source image rows, indexed with the offsets computed above.
133 Rows<const ScalarType> src_rows_;
// Source row stride as returned by src_rows_.stride(), in every lane.
134 uint16x4_t v_src_stride_;
// Largest valid x / y coordinate (width - 1 / height - 1), in every lane.
135 int16x8_t v_xmax_;
136 int16x8_t v_ymax_;
137 }; // end of class RemapS16Point5Replicate<uint8_t>
138
139 // Common interpolation function used by all RemapS16Point5 operations except
140 // 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>)
141 // because that processes one half vector in one step
//
// Inputs are eight lanes of neighbouring pixel values (a: top left, b: top
// right, c: bottom left, d: bottom right) and the matching weights.
// Per lane the result is
//   ((a * nxfrac + b * xfrac) * nyfrac + (c * nxfrac + d * xfrac) * yfrac
//    + bias) >> (2 * REMAP16POINT5_FRAC_BITS)
// evaluated in 32-bit arithmetic and returned as the low 16 bits.
142 46065 static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
143 uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac,
144 uint16x8_t nxfrac, uint16x8_t nyfrac) {
// Widening blend of two horizontally adjacent pixels for four lanes:
// nfrac * left + frac * right.
145 230325 auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right,
146 uint16x4_t frac,
147 uint16x4_t nfrac) -> uint32x4_t {
148 184260 return vmlal_u16(vmull_u16(nfrac, left), frac, right);
149 };
150
// Same blend applied to the low four lanes of the 8-lane inputs.
151 138195 auto interpolate_horizontal_low = [interpolate_horizontal](
152 uint16x8_t left, uint16x8_t right,
153 uint16x8_t frac,
154 uint16x8_t nfrac) -> uint32x4_t {
155 184260 return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
156 92130 vget_low_u16(frac), vget_low_u16(nfrac));
157 };
158
// Same blend applied to the high four lanes of the 8-lane inputs.
159 138195 auto interpolate_horizontal_high = [interpolate_horizontal](
160 uint16x8_t left, uint16x8_t right,
161 uint16x8_t frac,
162 uint16x8_t nfrac) -> uint32x4_t {
163 184260 return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
164 92130 vget_high_u16(frac), vget_high_u16(nfrac));
165 };
166
167 // Offset pixel values by 0.5 before rounding down.
168 46065 const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
169
// Blend of the two horizontally interpolated rows in 32 bits, followed by
// removal of the fractional bits contributed by both blends.
170 138195 auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
171 uint32x4_t nfrac) -> uint32x4_t {
172 92130 uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
173 184260 return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
174 92130 };
175
176 46065 uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac);
177 46065 uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac);
178 46065 uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac);
179 46065 uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac);
180
// The y weights are widened to 32 bits to match the horizontal results.
181 92130 uint32x4_t lo =
182 92130 interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)),
183 46065 vmovl_u16(vget_low_u16(nyfrac)));
184 92130 uint32x4_t hi = interpolate_vertical(
185 46065 line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac));
186
187 // Discard the upper 16 bits of each element (lower the precision back to the
188 // original 16 bits)
// vuzp1q_u16 keeps the even-indexed 16-bit elements of lo followed by those
// of hi, which are the low halves of the 32-bit lanes on little-endian targets.
189 92130 uint16x8_t result =
190 46065 vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
191 92130 return result;
192 46065 }
193
// Specialization for u16 images with replicated borders. Each vector step
// produces 8 destination pixels: prepare_maps() derives coordinates and
// weights into member vectors and transform_pixels() gathers, blends and stores.
194 template <>
195 class RemapS16Point5Replicate<uint16_t> {
196 public:
197 using ScalarType = uint16_t;
198 using MapVecTraits = neon::VecTraits<int16_t>;
199
// Broadcasts the row stride in elements (byte stride / sizeof(ScalarType)) and
// the largest valid coordinates; the per-step scratch vectors start at zero.
// NOTE(review): the static_casts narrow to 16 bits without any check here, so
// stride and dimensions are presumably validated by the caller - confirm.
200 67 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
201 size_t src_height)
202 67 : src_rows_{src_rows},
203 134 v_src_element_stride_{vdupq_n_u16(
204 67 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
205 67 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
206 67 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))},
207 67 xfrac_{vdupq_n_u16(0)},
208 67 yfrac_{vdupq_n_u16(0)},
209 67 nxfrac_{vdupq_n_u16(0)},
210 67 nyfrac_{vdupq_n_u16(0)},
211 67 x0_{vdupq_n_s16(0)},
212 67 x1_{vdupq_n_s16(0)},
213 67 y0_{vdupq_n_s16(0)},
214 67 y1_{vdupq_n_s16(0)} {}
215
// Writes `width` interpolated pixels of one destination row to dst.
// mapxy supplies interleaved (x, y) int16 coordinates and mapfrac one uint16
// of packed fractions per pixel.
216 79 void process_row(size_t width, Columns<const int16_t> mapxy,
217 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
// Processes one vector of 8 pixels, then advances all three cursors by `step`.
218 2476 auto vector_path = [&](size_t step) {
219 2397 prepare_maps(mapxy, mapfrac);
220 2397 transform_pixels(dst);
221
222 2397 mapxy += ptrdiff_t(step);
223 2397 mapfrac += ptrdiff_t(step);
224 2397 dst += ptrdiff_t(step);
225 2397 };
// NOTE(review): LoopUnroll is defined elsewhere; presumably unroll_once runs
// vector_path once per full vector. The cursors are then moved back by
// (step - remaining_length) so that one more full-width vector ends exactly at
// the end of the row, overlapping elements that were already written.
226 79 LoopUnroll loop{width, MapVecTraits::num_lanes()};
227 79 loop.unroll_once(vector_path);
228 158 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
229 79 static_cast<ptrdiff_t>(loop.remaining_length());
230 79 mapxy -= back_step;
231 79 mapfrac -= back_step;
232 79 dst -= back_step;
233 116 loop.remaining([&](size_t, size_t step) { vector_path(step); });
234 79 }
235
// Loads 8 coordinate pairs and 8 packed fractions and fills the member
// vectors xfrac_/yfrac_/nxfrac_/nyfrac_ and x0_/y0_/x1_/y1_.
236 2397 void prepare_maps(Columns<const int16_t> mapxy,
237 Columns<const uint16_t> mapfrac) {
238 2397 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
239 2397 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
240 2397 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
241 2397 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
// x fraction: low bits of frac; y fraction: the bits above FRAC_BITS.
// Both are forced to 0 in lanes whose coordinate is negative.
242 4794 xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
243 2397 vandq_u16(frac, frac_mask));
244 2397 yfrac_ = vbslq_u16(
245 2397 vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
246 2397 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask));
// Complementary weights: frac + nfrac == REMAP16POINT5_FRAC_MAX.
247 2397 nxfrac_ = vsubq_u16(frac_max, xfrac_);
248 2397 nyfrac_ = vsubq_u16(frac_max, yfrac_);
249
250 // Clamp coordinates to within the dimensions of the source image
// NOTE(review): x0_/y0_/x1_/y1_ are declared int16x8_t but receive uint16x8_t
// values here, which relies on implicit vector conversions.
251 2397 x0_ = vreinterpretq_u16_s16(
252 2397 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
253 2397 y0_ = vreinterpretq_u16_s16(
254 2397 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
255
256 // x1 = x0 + 1, except if it's already xmax
// vcltq_s16 yields all-ones in true lanes, so subtracting the mask adds 1.
// The comparison uses the unclamped coordinate, so a negative coordinate
// gives 1 here; load_pixels() clamps again before the value is used.
257 2397 x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_));
258 2397 y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_));
259 2397 }
260
// Gathers the four neighbours of each of the 8 pixels prepared by
// prepare_maps(), blends them with interpolate() and stores 8 u16 results.
261 2397 void transform_pixels(Columns<uint16_t> dst) {
262 2397 uint16x8_t a = load_pixels(x0_, y0_);
263 2397 uint16x8_t b = load_pixels(x1_, y0_);
264 2397 uint16x8_t c = load_pixels(x0_, y1_);
265 2397 uint16x8_t d = load_pixels(x1_, y1_);
266
267 4794 uint16x8_t result =
268 2397 interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
269
270 2397 vst1q_u16(&dst[0], result);
271 2397 }
272
// Returns the 8 source pixels at coordinates (x, y), one per lane, read with
// scalar loads at offset y * element_stride + x.
273 9588 uint16x8_t load_pixels(int16x8_t x, int16x8_t y) {
274 // Clamp coordinates to within the dimensions of the source image
// Negative values are raised to 0 in the signed domain, then the upper bound
// is applied as an unsigned minimum against v_xmax_ / v_ymax_.
275 19176 uint16x8_t x_clamped =
276 9588 vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_);
277 19176 uint16x8_t y_clamped =
278 9588 vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_);
279
280 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
281 19176 uint32x4_t indices_low =
282 19176 vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped),
283 9588 vget_low_u16(v_src_element_stride_));
284 19176 uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped),
285 9588 y_clamped, v_src_element_stride_);
286
287 // Read pixels from source
288 86292 uint16x8_t pixels = {
289 9588 src_rows_[vgetq_lane_u32(indices_low, 0)],
290 9588 src_rows_[vgetq_lane_u32(indices_low, 1)],
291 9588 src_rows_[vgetq_lane_u32(indices_low, 2)],
292 9588 src_rows_[vgetq_lane_u32(indices_low, 3)],
293 9588 src_rows_[vgetq_lane_u32(indices_high, 0)],
294 9588 src_rows_[vgetq_lane_u32(indices_high, 1)],
295 9588 src_rows_[vgetq_lane_u32(indices_high, 2)],
296 9588 src_rows_[vgetq_lane_u32(indices_high, 3)],
297 };
298
299 19176 return pixels;
300 9588 }
301
302 private:
// Source image rows, indexed with element offsets.
303 Rows<const ScalarType> src_rows_;
// Row stride in elements, in every lane.
304 uint16x8_t v_src_element_stride_;
// Largest valid x / y coordinate, in every lane.
305 int16x8_t v_xmax_;
306 int16x8_t v_ymax_;
// Per-step state written by prepare_maps() and read by transform_pixels().
307 uint16x8_t xfrac_;
308 uint16x8_t yfrac_;
309 uint16x8_t nxfrac_;
310 uint16x8_t nyfrac_;
311 int16x8_t x0_;
312 int16x8_t x1_;
313 int16x8_t y0_;
314 int16x8_t y1_;
315 }; // end of class RemapS16Point5Replicate<uint16_t>
316
317 template <typename ScalarType>
318 class RemapS16Point5ConstantBorder;
319
// Specialization for u8 images with a constant border. Each vector step
// produces 8 destination pixels; neighbours that fall outside the image take
// the border value instead of a source pixel.
320 template <>
321 class RemapS16Point5ConstantBorder<uint8_t> {
322 public:
323 using ScalarType = uint8_t;
324 using MapVecTraits = neon::VecTraits<int16_t>;
325
// Broadcasts stride, image dimensions and the border value (read once from
// *border_value and widened to 16 bits) into vectors.
// NOTE(review): the static_casts narrow to 16 bits without any check here, so
// stride and dimensions are presumably validated by the caller - confirm.
326 66 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
327 size_t src_width, size_t src_height,
328 const ScalarType *border_value)
329 66 : src_rows_{src_rows},
330 66 v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
331 66 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
332 66 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
333 66 v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {}
334
// Writes `width` interpolated pixels of one destination row to dst.
// mapxy supplies interleaved (x, y) int16 coordinates and mapfrac one uint16
// of packed fractions per pixel.
335 78 void process_row(size_t width, Columns<const int16_t> mapxy,
336 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
// Processes one vector of 8 pixels, then advances all three cursors by `step`.
337 2534 auto vector_path = [&](size_t step) {
338 2456 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
339 2456 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
340 2456 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
341 2456 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
// x fraction: low bits of frac; y fraction: the bits above FRAC_BITS.
// Unlike the replicate variants, fractions are not zeroed for negative
// coordinates, because out-of-range neighbours contribute the border value.
342 2456 uint16x8_t xfrac = vandq_u16(frac, frac_mask);
343 4912 uint16x8_t yfrac =
344 2456 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
345 2456 uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac);
346 2456 uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac);
347
// Coordinates are handled as unsigned; the +1 wraps modulo 2^16, so a
// coordinate of -1 (0xFFFF) has neighbour 0.
348 2456 uint16x8_t one = vdupq_n_u16(1);
349 2456 uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]);
350 2456 uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]);
351 2456 uint16x8_t x1 = vaddq_u16(x0, one);
352 2456 uint16x8_t y1 = vaddq_u16(y0, one);
353
// a: top left, b: top right, c: bottom left, d: bottom right.
354 4912 uint16x8_t a = load_pixels_or_constant_border(
355 2456 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0);
356 4912 uint16x8_t b = load_pixels_or_constant_border(
357 2456 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0);
358 4912 uint16x8_t c = load_pixels_or_constant_border(
359 2456 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1);
360 4912 uint16x8_t d = load_pixels_or_constant_border(
361 2456 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1);
362
363 2456 uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac);
364
// Saturating narrow from 16 to 8 bits, then store 8 pixels.
365 2456 vst1_u8(&dst[0], vqmovn_u16(result));
366 2456 mapxy += ptrdiff_t(step);
367 2456 mapfrac += ptrdiff_t(step);
368 2456 dst += ptrdiff_t(step);
369 2456 };
// NOTE(review): LoopUnroll is defined elsewhere; presumably unroll_once runs
// vector_path once per full vector. The cursors are then moved back by
// (step - remaining_length) so that one more full-width vector ends exactly at
// the end of the row, overlapping elements that were already written.
370 78 LoopUnroll loop{width, MapVecTraits::num_lanes()};
371 78 loop.unroll_once(vector_path);
372 156 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
373 78 static_cast<ptrdiff_t>(loop.remaining_length());
374 78 mapxy -= back_step;
375 78 mapfrac -= back_step;
376 78 dst -= back_step;
377 115 loop.remaining([&](size_t, size_t step) { vector_path(step); });
378 78 }
379
380 private:
// Returns, widened to 16 bits, the source pixel at (x, y) for lanes inside the
// image and v_border_ for all other lanes.
// NOTE(review): the parameter names end in '_' and shadow the data members of
// the same name; every call in this class passes those members.
// NOTE(review): x and y are already uint16x8_t, so passing them to
// vreinterpretq_u16_s16 relies on implicit vector conversions.
381 9824 uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_,
382 uint16x8_t v_src_element_stride_,
383 uint16x8_t v_width_,
384 uint16x8_t v_height_,
385 uint16x8_t v_border_, uint16x8_t x,
386 uint16x8_t y) {
387 // Find whether coordinates are within the image dimensions.
388 // Negative coordinates are interpreted as large values due to the s16->u16
389 // reinterpretation.
390 19648 uint16x8_t in_range =
391 19648 vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
392 9824 vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
393
394 // Zero out-of-range coordinates.
// Out-of-range lanes therefore read offset 0; the value read is discarded by
// the final select.
395 9824 x = vandq_u16(in_range, x);
396 9824 y = vandq_u16(in_range, y);
397
398 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
399 19648 uint32x4_t indices_low =
400 19648 vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
401 9824 vget_low_u16(v_src_element_stride_));
402 19648 uint32x4_t indices_high =
403 9824 vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
404
405 // Read pixels from source
406 88416 uint8x8_t pixels = {
407 9824 src_rows_[vgetq_lane_u32(indices_low, 0)],
408 9824 src_rows_[vgetq_lane_u32(indices_low, 1)],
409 9824 src_rows_[vgetq_lane_u32(indices_low, 2)],
410 9824 src_rows_[vgetq_lane_u32(indices_low, 3)],
411 9824 src_rows_[vgetq_lane_u32(indices_high, 0)],
412 9824 src_rows_[vgetq_lane_u32(indices_high, 1)],
413 9824 src_rows_[vgetq_lane_u32(indices_high, 2)],
414 9824 src_rows_[vgetq_lane_u32(indices_high, 3)],
415 };
416 // Select between source pixels and border colour
417 19648 return vbslq_u16(in_range, vmovl_u8(pixels), v_border_);
418 9824 }
419
// Source image rows, indexed with the offsets computed above.
420 Rows<const ScalarType> src_rows_;
// Source row stride as returned by src_rows_.stride(), in every lane.
421 uint16x8_t v_src_stride_;
// Image width / height, in every lane.
422 uint16x8_t v_width_;
423 uint16x8_t v_height_;
// Border value widened to 16 bits, in every lane.
424 uint16x8_t v_border_;
425 }; // end of class RemapS16Point5ConstantBorder<uint8_t>
426
// Specialization for u16 images with a constant border. Each vector step
// produces 8 destination pixels: prepare_maps() derives coordinates and
// weights into member vectors and transform_pixels() gathers, blends and
// stores. Neighbours outside the image take the border value.
427 template <>
428 class RemapS16Point5ConstantBorder<uint16_t> {
429 public:
430 using ScalarType = uint16_t;
431 using MapVecTraits = neon::VecTraits<int16_t>;
432
// Broadcasts the row stride in elements (byte stride / sizeof(ScalarType)),
// the image dimensions and the border value (read once from *border_value);
// the per-step scratch vectors start at zero.
// NOTE(review): the static_casts narrow to 16 bits without any check here, so
// stride and dimensions are presumably validated by the caller - confirm.
433 66 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
434 size_t src_width, size_t src_height,
435 const ScalarType *border_value)
436 66 : src_rows_{src_rows},
437 132 v_src_element_stride_{vdupq_n_u16(
438 66 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
439 66 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
440 66 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
441 66 v_border_{vdupq_n_u16(*border_value)},
442 66 xfrac_{vdupq_n_u16(0)},
443 66 yfrac_{vdupq_n_u16(0)},
444 66 nxfrac_{vdupq_n_u16(0)},
445 66 nyfrac_{vdupq_n_u16(0)},
446 66 x0_{vdupq_n_s16(0)},
447 66 x1_{vdupq_n_s16(0)},
448 66 y0_{vdupq_n_s16(0)},
449 66 y1_{vdupq_n_s16(0)} {}
450
// Loads 8 coordinate pairs and 8 packed fractions and fills the member
// vectors xfrac_/yfrac_/nxfrac_/nyfrac_ and x0_/y0_/x1_/y1_.
451 2396 void prepare_maps(Columns<const int16_t> mapxy,
452 Columns<const uint16_t> mapfrac) {
453 2396 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
454 2396 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
455 2396 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
456 2396 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
// x fraction: low bits of frac; y fraction: the bits above FRAC_BITS.
457 2396 xfrac_ = vandq_u16(frac, frac_mask);
458 2396 yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
// Complementary weights: frac + nfrac == REMAP16POINT5_FRAC_MAX.
459 2396 nxfrac_ = vsubq_u16(frac_max, xfrac_);
460 2396 nyfrac_ = vsubq_u16(frac_max, yfrac_);
461
// Coordinates are kept unclamped; range checking happens in load_pixels().
// NOTE(review): x1_/y1_ are int16x8_t members fed through vaddq_u16, which
// relies on implicit vector conversions.
462 2396 uint16x8_t one = vdupq_n_u16(1);
463 2396 x0_ = xy.val[0];
464 2396 y0_ = xy.val[1];
465 2396 x1_ = vaddq_u16(x0_, one);
466 2396 y1_ = vaddq_u16(y0_, one);
467 2396 }
468
// Writes `width` interpolated pixels of one destination row to dst.
// mapxy supplies interleaved (x, y) int16 coordinates and mapfrac one uint16
// of packed fractions per pixel.
469 78 void process_row(size_t width, Columns<const int16_t> mapxy,
470 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
// Processes one vector of 8 pixels, then advances all three cursors by `step`.
471 2474 auto vector_path = [&](size_t step) {
472 2396 prepare_maps(mapxy, mapfrac);
473 2396 transform_pixels(dst);
474
475 2396 mapxy += ptrdiff_t(step);
476 2396 mapfrac += ptrdiff_t(step);
477 2396 dst += ptrdiff_t(step);
478 2396 };
// NOTE(review): LoopUnroll is defined elsewhere; presumably unroll_once runs
// vector_path once per full vector. The cursors are then moved back by
// (step - remaining_length) so that one more full-width vector ends exactly at
// the end of the row, overlapping elements that were already written.
479 78 LoopUnroll loop{width, MapVecTraits::num_lanes()};
480 78 loop.unroll_once(vector_path);
481 156 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
482 78 static_cast<ptrdiff_t>(loop.remaining_length());
483 78 mapxy -= back_step;
484 78 mapfrac -= back_step;
485 78 dst -= back_step;
486 115 loop.remaining([&](size_t, size_t step) { vector_path(step); });
487 78 }
488
// Gathers the four neighbours of each of the 8 pixels prepared by
// prepare_maps(), blends them with interpolate() and stores 8 u16 results.
489 2396 void transform_pixels(Columns<uint16_t> dst) {
490 2396 uint16x8_t a = load_pixels(x0_, y0_);
491 2396 uint16x8_t b = load_pixels(x1_, y0_);
492 2396 uint16x8_t c = load_pixels(x0_, y1_);
493 2396 uint16x8_t d = load_pixels(x1_, y1_);
494
495 4792 uint16x8_t result =
496 2396 interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
497
498 2396 vst1q_u16(&dst[0], result);
499 2396 }
500
// Returns the source pixel at (x, y) for lanes inside the image and v_border_
// for all other lanes.
// NOTE(review): x and y are already uint16x8_t, so passing them to
// vreinterpretq_u16_s16 relies on implicit vector conversions.
501 9584 uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) {
502 // Find whether coordinates are within the image dimensions.
503 // Negative coordinates are interpreted as large values due to the s16->u16
504 // reinterpretation.
505 19168 uint16x8_t in_range =
506 19168 vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
507 9584 vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
508
509 // Zero out-of-range coordinates.
// Out-of-range lanes therefore read offset 0; the value read is discarded by
// the final select.
510 9584 x = vandq_u16(in_range, x);
511 9584 y = vandq_u16(in_range, y);
512
513 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
514 19168 uint32x4_t indices_low =
515 19168 vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
516 9584 vget_low_u16(v_src_element_stride_));
517 19168 uint32x4_t indices_high =
518 9584 vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
519
520 // Read pixels from source
521 86256 uint16x8_t pixels = {
522 9584 src_rows_[vgetq_lane_u32(indices_low, 0)],
523 9584 src_rows_[vgetq_lane_u32(indices_low, 1)],
524 9584 src_rows_[vgetq_lane_u32(indices_low, 2)],
525 9584 src_rows_[vgetq_lane_u32(indices_low, 3)],
526 9584 src_rows_[vgetq_lane_u32(indices_high, 0)],
527 9584 src_rows_[vgetq_lane_u32(indices_high, 1)],
528 9584 src_rows_[vgetq_lane_u32(indices_high, 2)],
529 9584 src_rows_[vgetq_lane_u32(indices_high, 3)],
530 };
531 // Select between source pixels and border colour
532 19168 return vbslq_u16(in_range, pixels, v_border_);
533 9584 }
534
535 private:
// Source image rows, indexed with element offsets.
536 Rows<const ScalarType> src_rows_;
// Row stride in elements, in every lane.
537 uint16x8_t v_src_element_stride_;
// Image width / height, in every lane.
538 uint16x8_t v_width_;
539 uint16x8_t v_height_;
// Border value, in every lane.
540 uint16x8_t v_border_;
// Per-step state written by prepare_maps() and read by transform_pixels().
541 uint16x8_t xfrac_;
542 uint16x8_t yfrac_;
543 uint16x8_t nxfrac_;
544 uint16x8_t nyfrac_;
545 int16x8_t x0_;
546 int16x8_t x1_;
547 int16x8_t y0_;
548 int16x8_t y1_;
549 }; // end of class RemapS16Point5ConstantBorder<uint16_t>
550
// Loads 8 map entries and splits them into their components.
// Outputs: x, y receive the de-interleaved integer coordinates from mapxy
// (raw, not clamped); xfrac, yfrac receive the two fraction fields unpacked
// from mapfrac.
// NOTE(review): x and y are uint16x8_t references assigned from int16x8_t
// values, which relies on implicit vector conversions.
551 9704 inline void get_coordinates(Columns<const int16_t> mapxy,
552 Columns<const uint16_t> mapfrac, uint16x8_t &x,
553 uint16x8_t &y, uint16x8_t &xfrac,
554 uint16x8_t &yfrac) {
555 9704 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
556 9704 x = xy.val[0];
557 9704 y = xy.val[1];
558
// x fraction: frac masked with FRAC_MAX - 1; y fraction: frac shifted right by
// FRAC_BITS, then masked the same way.
559 9704 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
560 9704 xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
561 19408 yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
562 9704 vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
563 9704 }
564
// Computes the element offsets of the four neighbours of four 4-channel
// pixels. Each output lane is y * v_src_element_stride + 4 * x, evaluated in
// 32 bits; the results are written to offsets_a..offsets_d.
565 19408 inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1,
566 uint16x4_t y1, uint32x4_t &offsets_a,
567 uint32x4_t &offsets_b, uint32x4_t &offsets_c,
568 uint32x4_t &offsets_d,
569 uint16x4_t v_src_element_stride) {
570 // Multiply by 4 because of channels
// vshll_n_u16 widens to 32 bits while shifting left by 2.
571 19408 uint32x4_t x0_scaled = vshll_n_u16(x0, 2);
572 19408 uint32x4_t x1_scaled = vshll_n_u16(x1, 2);
573
574 // Calculate offsets from coordinates (y * element_stride + x)
575 // a: top left, b: top right, c: bottom left, d: bottom right
576 19408 offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride);
577 19408 offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride);
578 19408 offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride);
579 19408 offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride);
580 19408 }
581
// Builds a weight vector for two 4-channel pixels: lanes 0-3 hold frac_low and
// lanes 4-7 hold frac_high, each widened from 8 to 16 bits.
// NOTE(review): this report records no execution counts for this function and
// no caller is visible in this part of the file - possibly unused; confirm.
582 inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low,
583 uint8_t frac_high) {
584 uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low,
585 frac_high, frac_high, frac_high, frac_high};
586 return vmovl_u8(frac_low_high);
587 }
588
// Reads sizeof(uint32_t) bytes starting at src and returns them zero-extended
// to 64 bits. memcpy is used for the read, so src needs no particular
// alignment.
589 157184 inline uint64_t load_32bit(const uint8_t *src) {
590 157184 uint32_t value = 0;
591 157184 memcpy(&value, src, sizeof(uint32_t));
592 314368 return static_cast<uint64_t>(value);
593 157184 }
594
// Gathers four 4-channel u8 pixels (4 bytes each) from the given offsets into
// one 16-byte vector, in lane order: pixel 0, 1, 2, 3.
595 39296 inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows,
596 uint32x4_t offsets) {
// Two 32-bit pixels are packed into each 64-bit value, the lower-numbered
// pixel in the low half.
597 78592 uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) |
598 39296 (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32);
599 78592 uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) |
600 39296 (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32);
// NOTE(review): vcombine (no type suffix) is not declared in the visible
// part of this file; presumably an overloaded wrapper from the project headers.
601 78592 return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23));
602 39296 }
603
// Stores the two 16-byte vectors in res starting at &dst[0] via
// neon::VecTraits<uint8_t>::store.
// NOTE(review): store() is defined elsewhere; presumably it writes val[0]
// followed by val[1] contiguously - confirm.
604 4912 inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) {
605 using ScalarType = uint8_t;
606 4912 neon::VecTraits<ScalarType>::store(res, &dst[0]);
607 4912 }
608
// Loads two 4-channel u16 pixels (four u16 values each) from the two given
// element offsets and returns them in one vector: pixel 0 in the low half,
// pixel 1 in the high half.
// NOTE(review): vcombine (no type suffix) is not declared in the visible part
// of this file; presumably an overloaded wrapper from the project headers.
609 76672 inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows,
610 uint32x2_t offsets) {
611 153344 return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]),
612 76672 vld1_u16(&src_rows[vget_lane_u32(offsets, 1)]));
613 }
614
// Stores the four 8-lane u16 vectors in res starting at &dst[0] via
// neon::VecTraits<uint16_t>::store.
// NOTE(review): store() is defined elsewhere; presumably it writes val[0]
// through val[3] contiguously - confirm.
615 4792 inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) {
616 using ScalarType = uint16_t;
617 4792 neon::VecTraits<ScalarType>::store(res, &dst[0]);
618 4792 }
619
620 // Replicate border specific functions
// Loads 8 map entries and prepares them for replicated-border sampling.
// Outputs: x0/y0 are the coordinates clamped to [0, v_xmax] / [0, v_ymax];
// x1/y1 are the next coordinate, held at the maximum when already there;
// xfrac/yfrac are the fractions, zeroed where the raw coordinate was negative.
// NOTE(review): the uint16x8_t values x0/y0 are passed to s16 intrinsics
// (vcltq_s16, vminq_s16), which relies on implicit vector conversions.
621 4852 inline void get_coordinates_replicate(Columns<const int16_t> mapxy,
622 Columns<const uint16_t> mapfrac,
623 uint16x8_t &x0, uint16x8_t &y0,
624 uint16x8_t &x1, uint16x8_t &y1,
625 uint16x8_t &xfrac, uint16x8_t &yfrac,
626 int16x8_t v_xmax, int16x8_t v_ymax) {
627 4852 get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
628
629 // Zero the xfrac (or yfrac) if x (or y) are below zero
630 4852 xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac);
631 4852 yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac);
632
633 // Clamp coordinates to within the dimensions of the source image
634 4852 x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax)));
635 4852 y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax)));
636
637 // x1 = x0 + 1, except if it's already xmax
// The comparison uses the clamped x0/y0, so x1/y1 never exceed the maximum.
// vcltq_s16 yields all-ones in true lanes, so subtracting the mask adds 1.
638 4852 x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax));
639 4852 y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax));
640 4852 }
641
// Gathers the four neighbour sets of four 4-channel u8 pixels.
// offsets_a..offsets_d select the top-left, top-right, bottom-left and
// bottom-right neighbours; a..d each receive 16 bytes (4 pixels x 4 channels).
642 4912 inline void load_pixels_u8_4ch_replicate(
643 Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
644 uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b,
645 uint8x16_t &c, uint8x16_t &d) {
646 4912 a = load_4px_4ch(src_rows, offsets_a);
647 4912 b = load_4px_4ch(src_rows, offsets_b);
648 4912 c = load_4px_4ch(src_rows, offsets_c);
649 4912 d = load_4px_4ch(src_rows, offsets_d);
650 4912 }
651
// Gathers the four neighbour sets of four 4-channel u16 pixels.
// For each of offsets_a..offsets_d the *_lo output holds the pixels at
// offset lanes 0 and 1, and the *_hi output the pixels at offset lanes 2 and 3
// (two pixels x 4 channels per vector).
652 4792 inline void load_pixels_u16_4ch_replicate(
653 Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
654 uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo,
655 uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo,
656 uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) {
657 4792 a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
658 4792 b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
659 4792 c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
660 4792 d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
661
662 4792 a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
663 4792 b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
664 4792 c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
665 4792 d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
666 4792 }
667
668 template <typename ScalarType>
669 class RemapS16Point5Replicate4ch;
670
// Specialization for 4-channel u8 images with replicated borders. Each vector
// step handles 8 destination pixels (32 output bytes), processed as two groups
// of four pixels; within a group, interpolate() is called once per pixel pair.
671 template <>
672 class RemapS16Point5Replicate4ch<uint8_t> {
673 public:
674 using ScalarType = uint8_t;
675 using MapVecTraits = neon::VecTraits<int16_t>;
676
// Broadcasts the source row stride and the largest valid coordinates
// (src_width - 1, src_height - 1) into vectors.
// NOTE(review): the static_casts narrow to 16 bits without any check here, so
// stride and dimensions are presumably validated by the caller - confirm.
677 66 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
678 size_t src_height)
679 66 : src_rows_{src_rows},
680 66 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
681 66 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
682 66 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
683
// Writes `width` interpolated 4-channel pixels of one destination row to dst.
// mapxy supplies interleaved (x, y) int16 coordinates and mapfrac one uint16
// of packed fractions per pixel.
684 78 void process_row(size_t width, Columns<const int16_t> mapxy,
685 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
// Processes one vector of 8 pixels, then advances all three cursors by `step`.
686 2534 auto vector_path = [&](size_t step) {
// Declared without initializers; all six are filled through the reference
// parameters of get_coordinates_replicate.
687 2456 uint16x8_t x0, y0, x1, y1;
688 2456 uint16x8_t xfrac, yfrac;
689 4912 get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
690 2456 v_xmax_, v_ymax_);
691
692 2456 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
693 2456 uint8x16_t a, b, c, d;
694 2456 uint8x16x2_t res;
695
// First group: pixels 0-3 (low halves of the coordinate vectors).
696 4912 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
697 2456 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
698 2456 offsets_d, v_src_stride_);
699 4912 load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
700 2456 offsets_d, a, b, c, d);
701
702 // Doubled fractions 001122..., low part
703 2456 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
704 2456 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
705 4912 uint16x8_t nxfrac2 =
706 2456 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
707 4912 uint16x8_t nyfrac2 =
708 2456 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
709 // Quadrupled fractions (00001111) are passed to interpolate
// Zipping the doubled vector with itself repeats each pixel's weight four
// times, once per channel: vzip1q covers pixels 0-1, vzip2q pixels 2-3.
710 4912 uint16x8_t res0 = interpolate(
711 2456 vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
712 2456 vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
713 2456 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
714 4912 uint16x8_t res1 = interpolate(
715 2456 vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
716 2456 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
717 2456 vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
// vuzp1q_u8 keeps the even-indexed bytes, i.e. the low byte of each 16-bit
// result on little-endian targets: 16 bytes = 4 pixels x 4 channels.
718 2456 res.val[0] =
719 2456 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
720
// Second group: pixels 4-7 (high halves of the coordinate vectors).
721 4912 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
722 2456 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
723 2456 offsets_d, v_src_stride_);
724 4912 load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
725 2456 offsets_d, a, b, c, d);
726 // Doubled fractions 001122..., high part
727 2456 xfrac2 = vzip2q(xfrac, xfrac);
728 2456 yfrac2 = vzip2q(yfrac, yfrac);
729 2456 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
730 2456 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
731 // Quadrupled fractions (00001111) are passed to interpolate
732 4912 res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
733 2456 vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
734 2456 vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
735 2456 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
736 4912 res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
737 2456 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
738 2456 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
739 2456 vzip2q(nyfrac2, nyfrac2));
740 2456 res.val[1] =
741 2456 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
742
743 2456 store_pixels_u8_4ch(res, dst);
744 2456 mapxy += ptrdiff_t(step);
745 2456 mapfrac += ptrdiff_t(step);
746 2456 dst += ptrdiff_t(step);
747 2456 };
748
// NOTE(review): LoopUnroll is defined elsewhere; presumably unroll_once runs
// vector_path once per full vector. The cursors are then moved back by
// (step - remaining_length) so that one more full-width vector ends exactly at
// the end of the row, overlapping elements that were already written.
749 78 LoopUnroll loop{width, MapVecTraits::num_lanes()};
750 78 loop.unroll_once(vector_path);
751 156 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
752 78 static_cast<ptrdiff_t>(loop.remaining_length());
753 78 mapxy -= back_step;
754 78 mapfrac -= back_step;
755 78 dst -= back_step;
756 115 loop.remaining([&](size_t, size_t step) { vector_path(step); });
757 78 }
758
759 private:
// Source image rows, indexed with the offsets from get_offsets_4ch.
760 Rows<const ScalarType> src_rows_;
// Source row stride as returned by src_rows_.stride(), in every lane.
761 uint16x4_t v_src_stride_;
// Largest valid x / y coordinate (width - 1 / height - 1), in every lane.
762 int16x8_t v_xmax_;
763 int16x8_t v_ymax_;
764 }; // end of class RemapS16Point5Replicate4ch<uint8_t>
765
// Specialisation of RemapS16Point5Replicate4ch for uint16_t samples.
// process_row() consumes MapVecTraits::num_lanes() map entries per
// vector_path() call and writes four result vectors (res.val[0..3]) through
// store_pixels_u16_4ch(). get_coordinates_replicate(), get_offsets_4ch(),
// load_pixels_u16_4ch_replicate() and interpolate() are defined outside this
// excerpt; presumably coordinates are clamped to [0, v_xmax_] x [0, v_ymax_]
// (border replication) - confirm against those helpers.
766 template <>
767 class RemapS16Point5Replicate4ch<uint16_t> {
768 public:
769 using ScalarType = uint16_t;
770 using MapVecTraits = neon::VecTraits<int16_t>;
771
// The row stride is divided by sizeof(ScalarType) (i.e. expressed in elements)
// and narrowed to uint16_t; src_width - 1 and src_height - 1 are narrowed to
// int16_t.
// NOTE(review): assumes all three values fit after narrowing - presumably
// guaranteed by the remap_s16point5_is_implemented() check in the caller;
// confirm.
772 66 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
773 size_t src_height)
774 66 : src_rows_{src_rows},
775 132 v_src_element_stride_{vdup_n_u16(
776 66 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
777 66 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
778 66 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
779
// Remaps one destination row of `width` pixels. mapxy, mapfrac and dst are
// cursors that the lambda below advances by `step` after every vector.
780 78 void process_row(size_t width, Columns<const int16_t> mapxy,
781 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
782 2474 auto vector_path = [&](size_t step) {
783 2396 uint16x8_t x0, y0, x1, y1;
784 2396 uint16x8_t xfrac, yfrac;
785 4792 get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
786 2396 v_xmax_, v_ymax_);
787
788 2396 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
789 2396 uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
790 2396 uint16x8x4_t res;
// First half: the low four lanes of x0/y0/x1/y1 yield res.val[0] and
// res.val[1].
791 4792 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
792 2396 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
793 2396 offsets_d, v_src_element_stride_);
794 4792 load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
795 2396 offsets_d, a_low, a_high, b_low, b_high,
796 c_low, c_high, d_low, d_high);
797
798 // Doubled fractions 001122..., low part
799 2396 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
800 2396 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
// Complementary weights: REMAP16POINT5_FRAC_MAX minus the fraction.
801 4792 uint16x8_t nxfrac2 =
802 2396 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
803 4792 uint16x8_t nyfrac2 =
804 2396 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
805 // Quadrupled fractions (00001111) are passed to interpolate
806 2396 res.val[0] =
807 4792 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
808 2396 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
809 2396 vzip1q(nyfrac2, nyfrac2));
810 2396 res.val[1] =
811 4792 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
812 2396 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
813 2396 vzip2q(nyfrac2, nyfrac2));
814
// Second half: the high four lanes yield res.val[2] and res.val[3]; the
// offset and pixel temporaries are reused.
815 4792 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
816 2396 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
817 2396 offsets_d, v_src_element_stride_);
818 4792 load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
819 2396 offsets_d, a_low, a_high, b_low, b_high,
820 c_low, c_high, d_low, d_high);
821 // Doubled fractions 001122..., high part
822 2396 xfrac2 = vzip2q(xfrac, xfrac);
823 2396 yfrac2 = vzip2q(yfrac, yfrac);
824 2396 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
825 2396 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
826 // Quadrupled fractions (00001111) are passed to interpolate
827 2396 res.val[2] =
828 4792 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
829 2396 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
830 2396 vzip1q(nyfrac2, nyfrac2));
831 2396 res.val[3] =
832 4792 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
833 2396 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
834 2396 vzip2q(nyfrac2, nyfrac2));
835
836 2396 store_pixels_u16_4ch(res, dst);
837 2396 mapxy += ptrdiff_t(step);
838 2396 mapfrac += ptrdiff_t(step);
839 2396 dst += ptrdiff_t(step);
840 2396 };
841
// After the full-width iterations the cursors are moved back by
// (step - remaining_length) before vector_path is handed to loop.remaining().
// NOTE(review): presumably this lets one more full-width vector end exactly at
// the row end (re-processing a few entries) instead of needing a scalar tail,
// and assumes width >= MapVecTraits::num_lanes() - confirm against LoopUnroll
// and the caller's size checks.
842 78 LoopUnroll loop{width, MapVecTraits::num_lanes()};
843 78 loop.unroll_once(vector_path);
844 156 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
845 78 static_cast<ptrdiff_t>(loop.remaining_length());
846 78 mapxy -= back_step;
847 78 mapfrac -= back_step;
848 78 dst -= back_step;
849 115 loop.remaining([&](size_t, size_t step) { vector_path(step); });
850 78 }
851
852 private:
853 Rows<const ScalarType> src_rows_;
854 uint16x4_t v_src_element_stride_;
855 int16x8_t v_xmax_;
856 int16x8_t v_ymax_;
857 }; // end of class RemapS16Point5Replicate4ch<uint16_t>
858
859 // Constant border specific functions
// Produces the four neighbour coordinates and the per-corner validity masks
// used by the constant-border paths.
//   x0, y0, xfrac, yfrac : filled by get_coordinates() (defined outside this
//                          excerpt).
//   x1, y1               : x0 + 1 and y0 + 1 (unsigned 16-bit add).
//   in_range_a..d        : lane masks for the corners (x0,y0), (x1,y0),
//                          (x0,y1) and (x1,y1); a lane is set only when both
//                          of its coordinates compare below v_width /
//                          v_height as unsigned values.
// NOTE(review): presumably negative map coordinates arrive here as large
// unsigned values and therefore fail the unsigned compare - confirm in
// get_coordinates().
860 4852 inline void get_coordinates_constant(
861 Columns<const int16_t> mapxy, Columns<const uint16_t> mapfrac,
862 uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0,
863 uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac,
864 uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c,
865 uint16x8_t &in_range_d) {
866 4852 get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
867
868 4852 uint16x8_t one = vdupq_n_u16(1);
869 4852 x1 = vaddq_u16(x0, one);
870 4852 y1 = vaddq_u16(y0, one);
871
// Per-axis tests first, then combined per corner.
872 4852 uint16x8_t x0_in_range = vcltq_u16(x0, v_width);
873 4852 uint16x8_t y0_in_range = vcltq_u16(y0, v_height);
874 4852 uint16x8_t x1_in_range = vcltq_u16(x1, v_width);
875 4852 uint16x8_t y1_in_range = vcltq_u16(y1, v_height);
876
877 4852 in_range_a = vandq(x0_in_range, y0_in_range);
878 4852 in_range_b = vandq(x1_in_range, y0_in_range);
879 4852 in_range_c = vandq(x0_in_range, y1_in_range);
880 4852 in_range_d = vandq(x1_in_range, y1_in_range);
881 4852 }
882
// Bitwise select: keeps the bits of `offsets` where `in_range` has bits set
// and yields 0 elsewhere, so a lane whose mask is all zeros gets offset 0.
883 38816 inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range,
884 uint32x4_t offsets) {
885 38816 return vbslq_u32(in_range, offsets, vdupq_n_u32(0));
886 }
887
// Bitwise select per 32-bit lane (4 channels x 8 bits = one pixel): takes the
// bits of `pixels` where `in_range` is set and the bits of `v_border`
// elsewhere.
//   in_range : lane mask, expected to be all ones or all zeros per lane.
//   pixels   : loaded pixel data.
//   v_border : border colour replicated across the vector.
// v_border is reinterpreted explicitly so that the vbslq_u32 call does not
// depend on implicit (lax) conversion from uint8x16_t to uint32x4_t; the
// selected bits are unchanged.
888 19648 inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range,
889 uint8x16_t pixels,
890 uint8x16_t v_border) {
891 19648 return vreinterpretq_u8_u32(
892 19648 vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), vreinterpretq_u32_u8(v_border)));
893 }
894
// Bitwise select per 64-bit lane (4 channels x 16 bits = one pixel): takes the
// bits of `pixels` where `in_range` is set and the bits of `v_border`
// elsewhere.
//   in_range : lane mask, expected to be all ones or all zeros per lane.
//   pixels   : loaded pixel data.
//   v_border : border colour replicated across the vector.
// v_border is reinterpreted explicitly so that the vbslq_u64 call does not
// depend on implicit (lax) conversion from uint16x8_t to uint64x2_t; the
// selected bits are unchanged.
895 38336 inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range,
896 uint16x8_t pixels,
897 uint16x8_t v_border) {
898 38336 return vreinterpretq_u16_u64(
899 38336 vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), vreinterpretq_u64_u16(v_border)));
900 }
901
// Loads the four corner pixel vectors a, b, c, d for the constant-border u8
// path in three stages:
//   1. offsets of lanes that are not in range are replaced by 0,
//   2. pixels are loaded through load_4px_4ch() (defined outside this
//      excerpt),
//   3. lanes that are not in range are overwritten with v_border.
// The offsets_* parameters are taken by value, so the caller's copies are not
// modified.
902 4912 inline void load_pixels_u8_4ch_constant(
903 Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
904 uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
905 uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
906 uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c,
907 uint8x16_t &d) {
908 4912 offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
909 4912 offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
910 4912 offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
911 4912 offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
912
913 4912 a = load_4px_4ch(src_rows, offsets_a);
914 4912 b = load_4px_4ch(src_rows, offsets_b);
915 4912 c = load_4px_4ch(src_rows, offsets_c);
916 4912 d = load_4px_4ch(src_rows, offsets_d);
917
918 4912 a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border);
919 4912 b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border);
920 4912 c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border);
921 4912 d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border);
922 4912 }
923
// Loads the four corner pixel vectors for the constant-border u16 path. Each
// corner is returned as two vectors: *_lo is loaded from the low two offsets
// and *_hi from the high two offsets, via load_2px_4ch() (defined outside
// this excerpt). Offsets of lanes that are not in range are replaced by 0
// before loading, and those lanes are overwritten with v_border afterwards.
// The offsets_* parameters are taken by value, so the caller's copies are not
// modified.
924 4792 inline void load_pixels_u16_4ch_constant(
925 Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
926 uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
927 uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
928 uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo,
929 uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo,
930 uint16x8_t &d_hi) {
931 4792 offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
932 4792 offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
933 4792 offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
934 4792 offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
935
936 4792 a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
937 4792 b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
938 4792 c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
939 4792 d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
940
941 // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
// The widening is signed (vmovl_s32), so an all-ones 32-bit lane becomes an
// all-ones 64-bit lane.
942 23960 auto low32_to_u64 = [](uint32x4_t bitset) {
943 19168 return vreinterpretq_u64_s64(
944 19168 vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset))));
945 };
946
947 9584 a_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo,
948 4792 v_border);
949 9584 b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo,
950 4792 v_border);
951 9584 c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo,
952 4792 v_border);
953 9584 d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo,
954 4792 v_border);
955
956 4792 a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
957 4792 b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
958 4792 c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
959 4792 d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
960
961 // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
// Same signed widening as above, applied to the high two lanes.
962 23960 auto hi32_to_u64 = [](uint32x4_t bitset) {
963 19168 return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset)));
964 };
965
966 9584 a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi,
967 4792 v_border);
968 9584 b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi,
969 4792 v_border);
970 9584 c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi,
971 4792 v_border);
972 9584 d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi,
973 4792 v_border);
974 4792 }
975
976 // Convert bitsets such as in_range to 32bits, making all 1s or all 0s
// Widens the low four 16-bit lanes with a signed lengthen (vmovl_s16), so an
// all-ones lane stays all ones. Despite the `_s32` suffix the result is
// returned as uint32x4_t.
977 19408 static uint32x4_t low16_to_s32(uint16x8_t bitset) {
978 19408 return vreinterpretq_u32_s32(
979 19408 vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset))));
980 }
981
// Widens the high four 16-bit lanes of a lane mask to 32 bits with a signed
// lengthen (vmovl_high_s16), so an all-ones lane stays all ones. Despite the
// `_s32` suffix the result is returned as uint32x4_t.
982 19408 static uint32x4_t hi16_to_s32(uint16x8_t bitset) {
983 19408 return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset)));
984 }
985
// Primary template: declared only. The uint8_t and uint16_t specialisations
// that follow provide the implementations.
986 template <typename ScalarType>
987 class RemapS16Point5Constant4ch;
988
// Constant-border specialisation for 4-channel uint8_t data.
// process_row() consumes MapVecTraits::num_lanes() map entries per
// vector_path() call and writes two result vectors (res.val[0..1]) through
// store_pixels_u8_4ch(). Corners whose coordinates are not in range are
// replaced by the border colour inside load_pixels_u8_4ch_constant().
// get_offsets_4ch(), interpolate() and store_pixels_u8_4ch() are defined
// outside this excerpt.
989 template <>
990 class RemapS16Point5Constant4ch<uint8_t> {
991 public:
992 using ScalarType = uint8_t;
993 using MapVecTraits = neon::VecTraits<int16_t>;
994
// src_rows_.stride(), src_width and src_height are narrowed to uint16_t.
// NOTE(review): assumes they fit - presumably guaranteed by the
// remap_s16point5_is_implemented() check in the caller; confirm.
// border_value must point to at least sizeof(uint32_t) readable bytes (four
// uint8_t values); they are copied and broadcast to every 32-bit lane of
// v_border_.
995 66 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
996 size_t src_height, const ScalarType *border_value)
997 66 : src_rows_{src_rows},
998 66 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
999 66 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
1000 66 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
1001 66 v_border_{} {
1002 66 uint32_t border_value_32{};
1003 66 memcpy(&border_value_32, border_value, sizeof(uint32_t));
1004 66 v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32));
1005 66 }
1006
// Remaps one destination row of `width` pixels. mapxy, mapfrac and dst are
// cursors that the lambda below advances by `step` after every vector.
1007 78 void process_row(size_t width, Columns<const int16_t> mapxy,
1008 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1009 2534 auto vector_path = [&](size_t step) {
1010 2456 uint16x8_t x0, y0, x1, y1;
1011 2456 uint16x8_t xfrac, yfrac;
1012 2456 uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
1013 2456 get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
1014 y1, xfrac, yfrac, in_range_a, in_range_b,
1015 in_range_c, in_range_d);
1016
1017 2456 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
1018 2456 uint8x16_t a, b, c, d;
1019 2456 uint8x16x2_t res;
1020
// First half: the low four lanes of the coordinates and masks yield
// res.val[0]. The 16-bit masks are widened to 32 bits to match the offsets.
1021 4912 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
1022 2456 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
1023 2456 offsets_d, v_src_stride_);
1024
1025 2456 load_pixels_u8_4ch_constant(
1026 2456 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1027 2456 low16_to_s32(in_range_a), low16_to_s32(in_range_b),
1028 2456 low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b,
1029 c, d);
1030
1031 // Doubled fractions 001122..., low part
1032 2456 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
1033 2456 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
// Complementary weights: REMAP16POINT5_FRAC_MAX minus the fraction.
1034 4912 uint16x8_t nxfrac2 =
1035 2456 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1036 4912 uint16x8_t nyfrac2 =
1037 2456 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1038 // Quadrupled fractions (00001111) are passed to interpolate
1039 4912 uint16x8_t res0 = interpolate(
1040 2456 vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
1041 2456 vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
1042 2456 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
1043 4912 uint16x8_t res1 = interpolate(
1044 2456 vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
1045 2456 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
1046 2456 vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
// vuzp1q_u8 keeps the even-indexed bytes, i.e. the low byte of every 16-bit
// result lane, packing res0 and res1 into one u8 vector.
1047 2456 res.val[0] =
1048 2456 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
1049
// Second half: the high four lanes yield res.val[1]; temporaries are reused.
1050 4912 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
1051 2456 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
1052 2456 offsets_d, v_src_stride_);
1053
1054 2456 load_pixels_u8_4ch_constant(
1055 2456 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1056 2456 hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
1057 2456 hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c,
1058 d);
1059 // Doubled fractions 001122..., high part
1060 2456 xfrac2 = vzip2q(xfrac, xfrac);
1061 2456 yfrac2 = vzip2q(yfrac, yfrac);
1062 2456 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1063 2456 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1064 // Quadrupled fractions (00001111) are passed to interpolate
1065 4912 res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
1066 2456 vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
1067 2456 vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
1068 2456 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
1069 4912 res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
1070 2456 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
1071 2456 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1072 2456 vzip2q(nyfrac2, nyfrac2));
1073 2456 res.val[1] =
1074 2456 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
1075
1076 2456 store_pixels_u8_4ch(res, dst);
1077 2456 mapxy += ptrdiff_t(step);
1078 2456 mapfrac += ptrdiff_t(step);
1079 2456 dst += ptrdiff_t(step);
1080 2456 };
1081
// After the full-width iterations the cursors are moved back by
// (step - remaining_length) before vector_path is handed to loop.remaining().
// NOTE(review): presumably this lets one more full-width vector end exactly at
// the row end (re-processing a few entries) instead of needing a scalar tail,
// and assumes width >= MapVecTraits::num_lanes() - confirm against LoopUnroll
// and the caller's size checks.
1082 78 LoopUnroll loop{width, MapVecTraits::num_lanes()};
1083 78 loop.unroll_once(vector_path);
1084 156 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
1085 78 static_cast<ptrdiff_t>(loop.remaining_length());
1086 78 mapxy -= back_step;
1087 78 mapfrac -= back_step;
1088 78 dst -= back_step;
1089 115 loop.remaining([&](size_t, size_t step) { vector_path(step); });
1090 78 }
1091
1092 private:
1093 Rows<const ScalarType> src_rows_;
1094 uint16x4_t v_src_stride_;
1095 uint16x8_t v_width_;
1096 uint16x8_t v_height_;
1097 uint8x16_t v_border_;
1098 }; // end of class RemapS16Point5Constant4ch<uint8_t>
1099
// Constant-border specialisation for 4-channel uint16_t data.
// process_row() consumes MapVecTraits::num_lanes() map entries per
// vector_path() call and writes four result vectors (res.val[0..3]) through
// store_pixels_u16_4ch(). Corners whose coordinates are not in range are
// replaced by the border colour inside load_pixels_u16_4ch_constant().
// get_offsets_4ch(), interpolate() and store_pixels_u16_4ch() are defined
// outside this excerpt.
1100 template <>
1101 class RemapS16Point5Constant4ch<uint16_t> {
1102 public:
1103 using ScalarType = uint16_t;
1104 using MapVecTraits = neon::VecTraits<int16_t>;
1105
// The row stride is divided by sizeof(ScalarType) (i.e. expressed in elements)
// and, like src_width and src_height, narrowed to uint16_t.
// NOTE(review): assumes they fit - presumably guaranteed by the
// remap_s16point5_is_implemented() check in the caller; confirm.
// border_value must point to at least sizeof(uint64_t) readable bytes (four
// uint16_t values); they are copied and broadcast to both 64-bit lanes of
// v_border_.
1106 66 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
1107 size_t src_height, const ScalarType *border_value)
1108 66 : src_rows_{src_rows},
1109 132 v_src_element_stride_{vdup_n_u16(
1110 66 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
1111 66 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
1112 66 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
1113 66 v_border_{} {
1114 66 uint64_t border_value_64{};
1115 66 memcpy(&border_value_64, border_value, sizeof(uint64_t));
1116 66 v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64));
1117 66 }
1118
// Remaps one destination row of `width` pixels. mapxy, mapfrac and dst are
// cursors that the lambda below advances by `step` after every vector.
1119 78 void process_row(size_t width, Columns<const int16_t> mapxy,
1120 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1121 2474 auto vector_path = [&](size_t step) {
1122 2396 uint16x8_t x0, y0, x1, y1;
1123 2396 uint16x8_t xfrac, yfrac;
1124 2396 uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
1125 2396 get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
1126 y1, xfrac, yfrac, in_range_a, in_range_b,
1127 in_range_c, in_range_d);
1128
1129 2396 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
1130 2396 uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
1131 2396 uint16x8x4_t res;
1132
// First half: the low four lanes of the coordinates and masks yield
// res.val[0] and res.val[1]. The 16-bit masks are widened to 32 bits to match
// the offsets.
1133 4792 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
1134 2396 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
1135 2396 offsets_d, v_src_element_stride_);
1136
1137 2396 load_pixels_u16_4ch_constant(
1138 2396 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1139 2396 low16_to_s32(in_range_a), low16_to_s32(in_range_b),
1140 2396 low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low,
1141 a_high, b_low, b_high, c_low, c_high, d_low, d_high);
1142
1143 // Doubled fractions 001122..., low part
1144 2396 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
1145 2396 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
// Complementary weights: REMAP16POINT5_FRAC_MAX minus the fraction.
1146 4792 uint16x8_t nxfrac2 =
1147 2396 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1148 4792 uint16x8_t nyfrac2 =
1149 2396 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1150 // Quadrupled fractions (00001111) are passed to interpolate
1151 2396 res.val[0] =
1152 4792 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
1153 2396 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
1154 2396 vzip1q(nyfrac2, nyfrac2));
1155 2396 res.val[1] =
1156 4792 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
1157 2396 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1158 2396 vzip2q(nyfrac2, nyfrac2));
1159
// Second half: the high four lanes yield res.val[2] and res.val[3];
// temporaries are reused.
1160 4792 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
1161 2396 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
1162 2396 offsets_d, v_src_element_stride_);
1163
1164 2396 load_pixels_u16_4ch_constant(
1165 2396 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1166 2396 hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
1167 2396 hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low,
1168 a_high, b_low, b_high, c_low, c_high, d_low, d_high);
1169
1170 // Doubled fractions 001122..., high part
1171 2396 xfrac2 = vzip2q(xfrac, xfrac);
1172 2396 yfrac2 = vzip2q(yfrac, yfrac);
1173 2396 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1174 2396 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1175 // Quadrupled fractions (00001111) are passed to interpolate
1176 2396 res.val[2] =
1177 4792 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
1178 2396 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
1179 2396 vzip1q(nyfrac2, nyfrac2));
1180 2396 res.val[3] =
1181 4792 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
1182 2396 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1183 2396 vzip2q(nyfrac2, nyfrac2));
1184
1185 2396 store_pixels_u16_4ch(res, dst);
1186 2396 mapxy += ptrdiff_t(step);
1187 2396 mapfrac += ptrdiff_t(step);
1188 2396 dst += ptrdiff_t(step);
1189 2396 };
1190
// After the full-width iterations the cursors are moved back by
// (step - remaining_length) before vector_path is handed to loop.remaining().
// NOTE(review): presumably this lets one more full-width vector end exactly at
// the row end (re-processing a few entries) instead of needing a scalar tail,
// and assumes width >= MapVecTraits::num_lanes() - confirm against LoopUnroll
// and the caller's size checks.
1191 78 LoopUnroll loop{width, MapVecTraits::num_lanes()};
1192 78 loop.unroll_once(vector_path);
1193 156 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
1194 78 static_cast<ptrdiff_t>(loop.remaining_length());
1195 78 mapxy -= back_step;
1196 78 mapfrac -= back_step;
1197 78 dst -= back_step;
1198 115 loop.remaining([&](size_t, size_t step) { vector_path(step); });
1199 78 }
1200
1201 private:
1202 Rows<const ScalarType> src_rows_;
1203 uint16x4_t v_src_element_stride_;
1204 uint16x8_t v_width_;
1205 uint16x8_t v_height_;
1206 uint16x8_t v_border_;
1207 }; // end of class RemapS16Point5Constant4ch<uint16_t>
1208
1209 // Most of the complexity comes from parameter checking.
1210 // NOLINTBEGIN(readability-function-cognitive-complexity)
// Entry point of the fixed-point remap: validates the arguments, then
// processes the destination row by row (zip_rows) with the operation class
// that matches border_type and channels.
//   src / dst    : image buffers with strides src_stride / dst_stride, wrapped
//                  as rows with `channels` values per pixel.
//   mapxy        : wrapped as rows with a channel count of 2.
//   mapfrac      : wrapped as rows with a channel count of 1.
//   border_value : must be non-null when border_type is
//                  KLEIDICV_BORDER_TYPE_CONSTANT.
// Returns KLEIDICV_ERROR_NULL_POINTER for a missing constant border value,
// KLEIDICV_ERROR_NOT_IMPLEMENTED when remap_s16point5_is_implemented<T>()
// rejects the configuration, and KLEIDICV_OK after processing.
// The CHECK_* macros are defined outside this excerpt; presumably they return
// early with an error code when a check fails - confirm.
1211 template <typename T>
1212 570 kleidicv_error_t remap_s16point5(
1213 const T *src, size_t src_stride, size_t src_width, size_t src_height,
1214 T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1215 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
1216 const uint16_t *mapfrac, size_t mapfrac_stride,
1217 [[maybe_unused]] kleidicv_border_type_t border_type,
1218 [[maybe_unused]] const T *border_value) {
1219
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 284 times.
570 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1220
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 283 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 283 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 283 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 283 times.
568 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
// Both maps are checked against dst_height: they hold one entry per
// destination pixel.
1221
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 282 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 282 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 282 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 282 times.
566 CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
1222
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 281 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 281 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 281 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 281 times.
564 CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height);
1223
12/12
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 280 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 278 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 278 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 280 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 278 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 278 times.
562 CHECK_IMAGE_SIZE(src_width, src_height);
1224
12/12
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 277 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 276 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 276 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 277 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 276 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 276 times.
556 CHECK_IMAGE_SIZE(dst_width, dst_height);
// The border colour is only dereferenced by the constant-border operations,
// so a null border_value is rejected for that border type only.
1225
8/8
✓ Branch 0 taken 134 times.
✓ Branch 1 taken 142 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 134 times.
✓ Branch 5 taken 142 times.
✓ Branch 6 taken 133 times.
✓ Branch 7 taken 1 times.
552 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
1226 2 return KLEIDICV_ERROR_NULL_POINTER;
1227 }
1228
1229
8/8
✓ Branch 0 taken 265 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 265 times.
✓ Branch 3 taken 10 times.
✓ Branch 4 taken 265 times.
✓ Branch 5 taken 10 times.
✓ Branch 6 taken 265 times.
✓ Branch 7 taken 10 times.
1100 if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height,
1230 550 dst_width, border_type, channels)) {
1231 20 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1232 }
1233
1234 530 Rows<const T> src_rows{src, src_stride, channels};
1235 530 Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
1236 530 Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1};
1237 530 Rows<T> dst_rows{dst, dst_stride, channels};
1238 530 Rectangle rect{dst_width, dst_height};
// Dispatch: from here on channels is either 1 or 4 and border_type is either
// CONSTANT or REPLICATE (the non-matching cases are only asserted).
1239
4/4
✓ Branch 0 taken 133 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 132 times.
530 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
1240
4/4
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 66 times.
✓ Branch 3 taken 66 times.
264 if (channels == 1) {
1241 264 RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height,
1242 132 border_value};
1243 132 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1244 132 } else {
1245 assert(channels == 4);
1246 264 RemapS16Point5Constant4ch<T> operation{src_rows, src_width, src_height,
1247 132 border_value};
1248 132 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1249 132 }
1250 264 } else {
1251 assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
1252
4/4
✓ Branch 0 taken 67 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 67 times.
✓ Branch 3 taken 66 times.
266 if (channels == 1) {
1253 134 RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height};
1254 134 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1255 134 } else {
1256 assert(channels == 4);
1257 132 RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height};
1258 132 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1259 132 }
1260 }
1261 530 return KLEIDICV_OK;
1262 570 }
1263 // NOLINTEND(readability-function-cognitive-complexity)
1264
// Explicit instantiation helper: expands to an explicit instantiation of
// remap_s16point5<type> decorated with KLEIDICV_TARGET_FN_ATTRS. It is used
// below for the two element types this file provides operation classes for,
// uint8_t and uint16_t.
1265 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \
1266 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \
1267 const type *src, size_t src_stride, size_t src_width, size_t src_height, \
1268 type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
1269 size_t channels, const int16_t *mapxy, size_t mapxy_stride, \
1270 const uint16_t *mapfrac, size_t mapfrac_stride, \
1271 kleidicv_border_type_t border_type, const type *border_value)
1272
1273 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
1274 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
1275
1276 } // namespace kleidicv::neon
1277