KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/remap_s16point5_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 790 790 100.0%
Functions: 67 67 100.0%
Branches: 84 84 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/neon.h"
8 #include "kleidicv/transform/remap.h"
9
10 namespace kleidicv::neon {
11
12 template <typename ScalarType>
13 class RemapS16Point5Replicate;
14
15 template <>
16 class RemapS16Point5Replicate<uint8_t> {
17 public:
18 using ScalarType = uint8_t;
19 using MapVecTraits = neon::VecTraits<int16_t>;
20 using MapVectorType = typename MapVecTraits::VectorType;
21 using MapVector2Type = typename MapVecTraits::Vector2Type;
22 using FracVecTraits = neon::VecTraits<uint16_t>;
23 using FracVectorType = typename FracVecTraits::VectorType;
24
25 170 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
26 size_t src_height)
27 170 : src_rows_{src_rows},
28 170 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
29 170 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
30 170 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
31
32 194 void process_row(size_t width, Columns<const int16_t> mapxy,
33 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
34 5492 auto vector_path = [&](size_t step) {
35 5298 MapVector2Type xy = vld2q_s16(&mapxy[0]);
36 5298 FracVectorType frac = vld1q_u16(&mapfrac[0]);
37 10596 uint16x8_t xfrac =
38 10596 vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
39 // extract xfrac = frac[0:4]
40 5298 vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
41 10596 uint16x8_t yfrac =
42 10596 vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
43 // extract yfrac = frac[5:9]
44 10596 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
45 5298 vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
46 5298 uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
47 5298 uint16x8_t nyfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
48
49 // Clamp coordinates to within the dimensions of the source image
50 10596 uint16x8_t x0 = vreinterpretq_u16_s16(
51 5298 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
52 10596 uint16x8_t y0 = vreinterpretq_u16_s16(
53 5298 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
54
55 // x1 = x0 + 1, except if it's already xmax
56 5298 uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_));
57 5298 uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_));
58
59 10596 uint16x4_t dst_low = load_and_interpolate(
60 5298 vmovl_u16(vget_low_u16(x0)), vget_low_u16(y0),
61 5298 vmovl_u16(vget_low_u16(x1)), vget_low_u16(y1), vget_low_u16(xfrac),
62 5298 vget_low_u16(yfrac), vget_low_u16(nxfrac), vget_low_u16(nyfrac));
63
64 10596 uint16x4_t dst_high = load_and_interpolate(
65 5298 vmovl_high_u16(x0), vget_high_u16(y0), vmovl_high_u16(x1),
66 5298 vget_high_u16(y1), vget_high_u16(xfrac), vget_high_u16(yfrac),
67 5298 vget_high_u16(nxfrac), vget_high_u16(nyfrac));
68
69 5298 vst1_u8(&dst[0], vuzp1_u8(dst_low, dst_high));
70 5298 mapxy += ptrdiff_t(step);
71 5298 mapfrac += ptrdiff_t(step);
72 5298 dst += ptrdiff_t(step);
73 5298 };
74 194 LoopUnroll loop{width, MapVecTraits::num_lanes()};
75 194 loop.unroll_once(vector_path);
76 388 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
77 194 static_cast<ptrdiff_t>(loop.remaining_length());
78 194 mapxy -= back_step;
79 194 mapfrac -= back_step;
80 194 dst -= back_step;
81 306 loop.remaining([&](size_t, size_t step) { vector_path(step); });
82 194 }
83
84 private:
85 10596 uint16x4_t load_and_interpolate(uint32x4_t x0, uint16x4_t y0, uint32x4_t x1,
86 uint16x4_t y1, uint16x4_t xfrac,
87 uint16x4_t yfrac, uint16x4_t nxfrac,
88 uint16x4_t nyfrac) {
89 // Calculate offsets from coordinates (y * stride + x)
90 // a: top left, b: top right, c: bottom left, d: bottom right
91 10596 uint32x4_t offset = vmlal_u16(x0, y0, v_src_stride_);
92 21192 uint64_t acc =
93 21192 static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
94 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
95 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
96 10596 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
97 10596 uint16x4_t a = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
98
99 10596 offset = vmlal_u16(x1, y0, v_src_stride_);
100
101 31788 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
102 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
103 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
104 10596 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
105 10596 uint16x4_t b = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
106
107 10596 uint16x4_t line0 = vmla_u16(vmul_u16(xfrac, b), nxfrac, a);
108
109 10596 offset = vmlal_u16(x0, y1, v_src_stride_);
110
111 31788 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
112 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
113 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
114 10596 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
115 10596 uint16x4_t c = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
116
117 21192 uint32x4_t line0_lerpd = vmlal_u16(
118 10596 vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, nyfrac);
119
120 10596 offset = vmlal_u16(x1, y1, v_src_stride_);
121
122 31788 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
123 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
124 21192 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
125 10596 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
126 10596 uint16x4_t d = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
127
128 10596 uint16x4_t line1 = vmla_u16(vmul_u16(xfrac, d), nxfrac, c);
129 21192 return vshrn_n_u32(vmlal_u16(line0_lerpd, line1, yfrac),
130 2 * REMAP16POINT5_FRAC_BITS);
131 10596 }
132
133 Rows<const ScalarType> src_rows_;
134 uint16x4_t v_src_stride_;
135 int16x8_t v_xmax_;
136 int16x8_t v_ymax_;
137 }; // end of class RemapS16Point5Replicate<uint8_t>
138
139 // Common interpolation function used by all RemapS16Point5 operations except
140 // 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>),
141 // because that class processes one half vector per step
142 97626 static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
143 uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac,
144 uint16x8_t nxfrac, uint16x8_t nyfrac) {
145 488130 auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right,
146 uint16x4_t frac,
147 uint16x4_t nfrac) -> uint32x4_t {
148 390504 return vmlal_u16(vmull_u16(nfrac, left), frac, right);
149 };
150
151 292878 auto interpolate_horizontal_low = [interpolate_horizontal](
152 uint16x8_t left, uint16x8_t right,
153 uint16x8_t frac,
154 uint16x8_t nfrac) -> uint32x4_t {
155 390504 return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
156 195252 vget_low_u16(frac), vget_low_u16(nfrac));
157 };
158
159 292878 auto interpolate_horizontal_high = [interpolate_horizontal](
160 uint16x8_t left, uint16x8_t right,
161 uint16x8_t frac,
162 uint16x8_t nfrac) -> uint32x4_t {
163 390504 return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
164 195252 vget_high_u16(frac), vget_high_u16(nfrac));
165 };
166
167 // Offset pixel values by 0.5 before rounding down.
168 97626 const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
169
170 292878 auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
171 uint32x4_t nfrac) -> uint32x4_t {
172 195252 uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
173 390504 return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
174 195252 };
175
176 97626 uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac);
177 97626 uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac);
178 97626 uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac);
179 97626 uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac);
180
181 195252 uint32x4_t lo =
182 195252 interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)),
183 97626 vmovl_u16(vget_low_u16(nyfrac)));
184 195252 uint32x4_t hi = interpolate_vertical(
185 97626 line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac));
186
187 // Discard upper 16 bits of each element (lower the precision back to the
188 // original 16 bits)
189 195252 uint16x8_t result =
190 97626 vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
191 195252 return result;
192 97626 }
193
194 template <>
195 class RemapS16Point5Replicate<uint16_t> {
196 public:
197 using ScalarType = uint16_t;
198 using MapVecTraits = neon::VecTraits<int16_t>;
199
200 170 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
201 size_t src_height)
202 170 : src_rows_{src_rows},
203 340 v_src_element_stride_{vdupq_n_u16(
204 170 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
205 170 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
206 170 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))},
207 170 xfrac_{vdupq_n_u16(0)},
208 170 yfrac_{vdupq_n_u16(0)},
209 170 nxfrac_{vdupq_n_u16(0)},
210 170 nyfrac_{vdupq_n_u16(0)},
211 170 x0_{vdupq_n_s16(0)},
212 170 x1_{vdupq_n_s16(0)},
213 170 y0_{vdupq_n_s16(0)},
214 170 y1_{vdupq_n_s16(0)} {}
215
216 194 void process_row(size_t width, Columns<const int16_t> mapxy,
217 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
218 5192 auto vector_path = [&](size_t step) {
219 4998 prepare_maps(mapxy, mapfrac);
220 4998 transform_pixels(dst);
221
222 4998 mapxy += ptrdiff_t(step);
223 4998 mapfrac += ptrdiff_t(step);
224 4998 dst += ptrdiff_t(step);
225 4998 };
226 194 LoopUnroll loop{width, MapVecTraits::num_lanes()};
227 194 loop.unroll_once(vector_path);
228 388 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
229 194 static_cast<ptrdiff_t>(loop.remaining_length());
230 194 mapxy -= back_step;
231 194 mapfrac -= back_step;
232 194 dst -= back_step;
233 306 loop.remaining([&](size_t, size_t step) { vector_path(step); });
234 194 }
235
236 4998 void prepare_maps(Columns<const int16_t> mapxy,
237 Columns<const uint16_t> mapfrac) {
238 4998 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
239 4998 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
240 4998 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
241 4998 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
242 9996 xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
243 4998 vandq_u16(frac, frac_mask));
244 4998 yfrac_ = vbslq_u16(
245 4998 vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
246 4998 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask));
247 4998 nxfrac_ = vsubq_u16(frac_max, xfrac_);
248 4998 nyfrac_ = vsubq_u16(frac_max, yfrac_);
249
250 // Clamp coordinates to within the dimensions of the source image
251 4998 x0_ = vreinterpretq_u16_s16(
252 4998 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
253 4998 y0_ = vreinterpretq_u16_s16(
254 4998 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
255
256 // x1 = x0 + 1, except if it's already xmax
257 4998 x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_));
258 4998 y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_));
259 4998 }
260
261 4998 void transform_pixels(Columns<uint16_t> dst) {
262 4998 uint16x8_t a = load_pixels(x0_, y0_);
263 4998 uint16x8_t b = load_pixels(x1_, y0_);
264 4998 uint16x8_t c = load_pixels(x0_, y1_);
265 4998 uint16x8_t d = load_pixels(x1_, y1_);
266
267 9996 uint16x8_t result =
268 4998 interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
269
270 4998 vst1q_u16(&dst[0], result);
271 4998 }
272
273 19992 uint16x8_t load_pixels(int16x8_t x, int16x8_t y) {
274 // Clamp coordinates to within the dimensions of the source image
275 39984 uint16x8_t x_clamped =
276 19992 vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_);
277 39984 uint16x8_t y_clamped =
278 19992 vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_);
279
280 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
281 39984 uint32x4_t indices_low =
282 39984 vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped),
283 19992 vget_low_u16(v_src_element_stride_));
284 39984 uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped),
285 19992 y_clamped, v_src_element_stride_);
286
287 // Read pixels from source
288 179928 uint16x8_t pixels = {
289 19992 src_rows_[vgetq_lane_u32(indices_low, 0)],
290 19992 src_rows_[vgetq_lane_u32(indices_low, 1)],
291 19992 src_rows_[vgetq_lane_u32(indices_low, 2)],
292 19992 src_rows_[vgetq_lane_u32(indices_low, 3)],
293 19992 src_rows_[vgetq_lane_u32(indices_high, 0)],
294 19992 src_rows_[vgetq_lane_u32(indices_high, 1)],
295 19992 src_rows_[vgetq_lane_u32(indices_high, 2)],
296 19992 src_rows_[vgetq_lane_u32(indices_high, 3)],
297 };
298
299 39984 return pixels;
300 19992 }
301
302 private:
303 Rows<const ScalarType> src_rows_;
304 uint16x8_t v_src_element_stride_;
305 int16x8_t v_xmax_;
306 int16x8_t v_ymax_;
307 uint16x8_t xfrac_;
308 uint16x8_t yfrac_;
309 uint16x8_t nxfrac_;
310 uint16x8_t nyfrac_;
311 int16x8_t x0_;
312 int16x8_t x1_;
313 int16x8_t y0_;
314 int16x8_t y1_;
315 }; // end of class RemapS16Point5Replicate<uint16_t>
316
317 template <typename ScalarType>
318 class RemapS16Point5ConstantBorder;
319
320 template <>
321 class RemapS16Point5ConstantBorder<uint8_t> {
322 public:
323 using ScalarType = uint8_t;
324 using MapVecTraits = neon::VecTraits<int16_t>;
325
326 168 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
327 size_t src_width, size_t src_height,
328 const ScalarType *border_value)
329 168 : src_rows_{src_rows},
330 168 v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
331 168 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
332 168 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
333 168 v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {}
334
335 192 void process_row(size_t width, Columns<const int16_t> mapxy,
336 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
337 5488 auto vector_path = [&](size_t step) {
338 5296 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
339 5296 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
340 5296 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
341 5296 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
342 5296 uint16x8_t xfrac = vandq_u16(frac, frac_mask);
343 10592 uint16x8_t yfrac =
344 5296 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
345 5296 uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac);
346 5296 uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac);
347
348 5296 uint16x8_t one = vdupq_n_u16(1);
349 5296 uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]);
350 5296 uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]);
351 5296 uint16x8_t x1 = vaddq_u16(x0, one);
352 5296 uint16x8_t y1 = vaddq_u16(y0, one);
353
354 10592 uint16x8_t a = load_pixels_or_constant_border(
355 5296 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0);
356 10592 uint16x8_t b = load_pixels_or_constant_border(
357 5296 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0);
358 10592 uint16x8_t c = load_pixels_or_constant_border(
359 5296 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1);
360 10592 uint16x8_t d = load_pixels_or_constant_border(
361 5296 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1);
362
363 5296 uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac);
364
365 5296 vst1_u8(&dst[0], vqmovn_u16(result));
366 5296 mapxy += ptrdiff_t(step);
367 5296 mapfrac += ptrdiff_t(step);
368 5296 dst += ptrdiff_t(step);
369 5296 };
370 192 LoopUnroll loop{width, MapVecTraits::num_lanes()};
371 192 loop.unroll_once(vector_path);
372 384 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
373 192 static_cast<ptrdiff_t>(loop.remaining_length());
374 192 mapxy -= back_step;
375 192 mapfrac -= back_step;
376 192 dst -= back_step;
377 304 loop.remaining([&](size_t, size_t step) { vector_path(step); });
378 192 }
379
380 private:
381 21184 uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_,
382 uint16x8_t v_src_element_stride_,
383 uint16x8_t v_width_,
384 uint16x8_t v_height_,
385 uint16x8_t v_border_, uint16x8_t x,
386 uint16x8_t y) {
387 // Find whether coordinates are within the image dimensions.
388 // Negative coordinates are interpreted as large values due to the s16->u16
389 // reinterpretation.
390 42368 uint16x8_t in_range =
391 42368 vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
392 21184 vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
393
394 // Zero out-of-range coordinates.
395 21184 x = vandq_u16(in_range, x);
396 21184 y = vandq_u16(in_range, y);
397
398 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
399 42368 uint32x4_t indices_low =
400 42368 vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
401 21184 vget_low_u16(v_src_element_stride_));
402 42368 uint32x4_t indices_high =
403 21184 vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
404
405 // Read pixels from source
406 190656 uint8x8_t pixels = {
407 21184 src_rows_[vgetq_lane_u32(indices_low, 0)],
408 21184 src_rows_[vgetq_lane_u32(indices_low, 1)],
409 21184 src_rows_[vgetq_lane_u32(indices_low, 2)],
410 21184 src_rows_[vgetq_lane_u32(indices_low, 3)],
411 21184 src_rows_[vgetq_lane_u32(indices_high, 0)],
412 21184 src_rows_[vgetq_lane_u32(indices_high, 1)],
413 21184 src_rows_[vgetq_lane_u32(indices_high, 2)],
414 21184 src_rows_[vgetq_lane_u32(indices_high, 3)],
415 };
416 // Select between source pixels and border color
417 42368 return vbslq_u16(in_range, vmovl_u8(pixels), v_border_);
418 21184 }
419
420 Rows<const ScalarType> src_rows_;
421 uint16x8_t v_src_stride_;
422 uint16x8_t v_width_;
423 uint16x8_t v_height_;
424 uint16x8_t v_border_;
425 }; // end of class RemapS16Point5ConstantBorder<uint8_t>
426
427 template <>
428 class RemapS16Point5ConstantBorder<uint16_t> {
429 public:
430 using ScalarType = uint16_t;
431 using MapVecTraits = neon::VecTraits<int16_t>;
432
433 168 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
434 size_t src_width, size_t src_height,
435 const ScalarType *border_value)
436 168 : src_rows_{src_rows},
437 336 v_src_element_stride_{vdupq_n_u16(
438 168 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
439 168 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
440 168 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
441 168 v_border_{vdupq_n_u16(*border_value)},
442 168 xfrac_{vdupq_n_u16(0)},
443 168 yfrac_{vdupq_n_u16(0)},
444 168 nxfrac_{vdupq_n_u16(0)},
445 168 nyfrac_{vdupq_n_u16(0)},
446 168 x0_{vdupq_n_s16(0)},
447 168 x1_{vdupq_n_s16(0)},
448 168 y0_{vdupq_n_s16(0)},
449 168 y1_{vdupq_n_s16(0)} {}
450
451 4996 void prepare_maps(Columns<const int16_t> mapxy,
452 Columns<const uint16_t> mapfrac) {
453 4996 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
454 4996 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
455 4996 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
456 4996 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
457 4996 xfrac_ = vandq_u16(frac, frac_mask);
458 4996 yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
459 4996 nxfrac_ = vsubq_u16(frac_max, xfrac_);
460 4996 nyfrac_ = vsubq_u16(frac_max, yfrac_);
461
462 4996 uint16x8_t one = vdupq_n_u16(1);
463 4996 x0_ = xy.val[0];
464 4996 y0_ = xy.val[1];
465 4996 x1_ = vaddq_u16(x0_, one);
466 4996 y1_ = vaddq_u16(y0_, one);
467 4996 }
468
469 192 void process_row(size_t width, Columns<const int16_t> mapxy,
470 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
471 5188 auto vector_path = [&](size_t step) {
472 4996 prepare_maps(mapxy, mapfrac);
473 4996 transform_pixels(dst);
474
475 4996 mapxy += ptrdiff_t(step);
476 4996 mapfrac += ptrdiff_t(step);
477 4996 dst += ptrdiff_t(step);
478 4996 };
479 192 LoopUnroll loop{width, MapVecTraits::num_lanes()};
480 192 loop.unroll_once(vector_path);
481 384 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
482 192 static_cast<ptrdiff_t>(loop.remaining_length());
483 192 mapxy -= back_step;
484 192 mapfrac -= back_step;
485 192 dst -= back_step;
486 304 loop.remaining([&](size_t, size_t step) { vector_path(step); });
487 192 }
488
489 4996 void transform_pixels(Columns<uint16_t> dst) {
490 4996 uint16x8_t a = load_pixels(x0_, y0_);
491 4996 uint16x8_t b = load_pixels(x1_, y0_);
492 4996 uint16x8_t c = load_pixels(x0_, y1_);
493 4996 uint16x8_t d = load_pixels(x1_, y1_);
494
495 9992 uint16x8_t result =
496 4996 interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
497
498 4996 vst1q_u16(&dst[0], result);
499 4996 }
500
501 19984 uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) {
502 // Find whether coordinates are within the image dimensions.
503 // Negative coordinates are interpreted as large values due to the s16->u16
504 // reinterpretation.
505 39968 uint16x8_t in_range =
506 39968 vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
507 19984 vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
508
509 // Zero out-of-range coordinates.
510 19984 x = vandq_u16(in_range, x);
511 19984 y = vandq_u16(in_range, y);
512
513 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
514 39968 uint32x4_t indices_low =
515 39968 vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
516 19984 vget_low_u16(v_src_element_stride_));
517 39968 uint32x4_t indices_high =
518 19984 vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
519
520 // Read pixels from source
521 179856 uint16x8_t pixels = {
522 19984 src_rows_[vgetq_lane_u32(indices_low, 0)],
523 19984 src_rows_[vgetq_lane_u32(indices_low, 1)],
524 19984 src_rows_[vgetq_lane_u32(indices_low, 2)],
525 19984 src_rows_[vgetq_lane_u32(indices_low, 3)],
526 19984 src_rows_[vgetq_lane_u32(indices_high, 0)],
527 19984 src_rows_[vgetq_lane_u32(indices_high, 1)],
528 19984 src_rows_[vgetq_lane_u32(indices_high, 2)],
529 19984 src_rows_[vgetq_lane_u32(indices_high, 3)],
530 };
531 // Select between source pixels and border color
532 39968 return vbslq_u16(in_range, pixels, v_border_);
533 19984 }
534
535 private:
536 Rows<const ScalarType> src_rows_;
537 uint16x8_t v_src_element_stride_;
538 uint16x8_t v_width_;
539 uint16x8_t v_height_;
540 uint16x8_t v_border_;
541 uint16x8_t xfrac_;
542 uint16x8_t yfrac_;
543 uint16x8_t nxfrac_;
544 uint16x8_t nyfrac_;
545 int16x8_t x0_;
546 int16x8_t x1_;
547 int16x8_t y0_;
548 int16x8_t y1_;
549 }; // end of class RemapS16Point5ConstantBorder<uint16_t>
550
551 20584 inline void get_coordinates(Columns<const int16_t> mapxy,
552 Columns<const uint16_t> mapfrac, uint16x8_t &x,
553 uint16x8_t &y, uint16x8_t &xfrac,
554 uint16x8_t &yfrac) {
555 20584 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
556 20584 x = xy.val[0];
557 20584 y = xy.val[1];
558
559 20584 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
560 20584 xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
561 41168 yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
562 20584 vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
563 20584 }
564
565 41168 inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1,
566 uint16x4_t y1, uint32x4_t &offsets_a,
567 uint32x4_t &offsets_b, uint32x4_t &offsets_c,
568 uint32x4_t &offsets_d,
569 uint16x4_t v_src_element_stride) {
570 // Multiply by 4 because of channels
571 41168 uint32x4_t x0_scaled = vshll_n_u16(x0, 2);
572 41168 uint32x4_t x1_scaled = vshll_n_u16(x1, 2);
573
574 // Calculate offsets from coordinates (y * element_stride + x)
575 // a: top left, b: top right, c: bottom left, d: bottom right
576 41168 offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride);
577 41168 offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride);
578 41168 offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride);
579 41168 offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride);
580 41168 }
581
582 inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low,
583 uint8_t frac_high) {
584 uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low,
585 frac_high, frac_high, frac_high, frac_high};
586 return vmovl_u8(frac_low_high);
587 }
588
589 338944 inline uint64_t load_32bit(const uint8_t *src) {
590 338944 uint32_t value = 0;
591 338944 memcpy(&value, src, sizeof(uint32_t));
592 677888 return static_cast<uint64_t>(value);
593 338944 }
594
595 84736 inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows,
596 uint32x4_t offsets) {
597 169472 uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) |
598 84736 (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32);
599 169472 uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) |
600 84736 (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32);
601 169472 return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23));
602 84736 }
603
604 10592 inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) {
605 using ScalarType = uint8_t;
606 10592 neon::VecTraits<ScalarType>::store(res, &dst[0]);
607 10592 }
608
609 159872 inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows,
610 uint32x2_t offsets) {
611 319744 return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]),
612 159872 vld1_u16(&src_rows[vget_lane_u32(offsets, 1)]));
613 }
614
615 9992 inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) {
616 using ScalarType = uint16_t;
617 9992 neon::VecTraits<ScalarType>::store(res, &dst[0]);
618 9992 }
619
620 // Replicate border specific functions
621 10292 inline void get_coordinates_replicate(Columns<const int16_t> mapxy,
622 Columns<const uint16_t> mapfrac,
623 uint16x8_t &x0, uint16x8_t &y0,
624 uint16x8_t &x1, uint16x8_t &y1,
625 uint16x8_t &xfrac, uint16x8_t &yfrac,
626 int16x8_t v_xmax, int16x8_t v_ymax) {
627 10292 get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
628
629 // Zero the xfrac (or yfrac) if x (or y) are below zero
630 10292 xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac);
631 10292 yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac);
632
633 // Clamp coordinates to within the dimensions of the source image
634 10292 x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax)));
635 10292 y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax)));
636
637 // x1 = x0 + 1, except if it's already xmax
638 10292 x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax));
639 10292 y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax));
640 10292 }
641
642 10592 inline void load_pixels_u8_4ch_replicate(
643 Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
644 uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b,
645 uint8x16_t &c, uint8x16_t &d) {
646 10592 a = load_4px_4ch(src_rows, offsets_a);
647 10592 b = load_4px_4ch(src_rows, offsets_b);
648 10592 c = load_4px_4ch(src_rows, offsets_c);
649 10592 d = load_4px_4ch(src_rows, offsets_d);
650 10592 }
651
652 9992 inline void load_pixels_u16_4ch_replicate(
653 Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
654 uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo,
655 uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo,
656 uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) {
657 9992 a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
658 9992 b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
659 9992 c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
660 9992 d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
661
662 9992 a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
663 9992 b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
664 9992 c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
665 9992 d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
666 9992 }
667
668 template <typename ScalarType>
669 class RemapS16Point5Replicate4ch;
670
671 template <>
672 class RemapS16Point5Replicate4ch<uint8_t> {
673 public:
674 using ScalarType = uint8_t;
675 using MapVecTraits = neon::VecTraits<int16_t>;
676
677 168 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
678 size_t src_height)
679 168 : src_rows_{src_rows},
680 168 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
681 168 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
682 168 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
683
684 192 void process_row(size_t width, Columns<const int16_t> mapxy,
685 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
686 5488 auto vector_path = [&](size_t step) {
687 5296 uint16x8_t x0, y0, x1, y1;
688 5296 uint16x8_t xfrac, yfrac;
689 10592 get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
690 5296 v_xmax_, v_ymax_);
691
692 5296 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
693 5296 uint8x16_t a, b, c, d;
694 5296 uint8x16x2_t res;
695
696 10592 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
697 5296 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
698 5296 offsets_d, v_src_stride_);
699 10592 load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
700 5296 offsets_d, a, b, c, d);
701
702 // Doubled fractions 001122..., low part
703 5296 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
704 5296 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
705 10592 uint16x8_t nxfrac2 =
706 5296 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
707 10592 uint16x8_t nyfrac2 =
708 5296 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
709 // Quadrupled fractions (00001111) are passed to interpolate
710 10592 uint16x8_t res0 = interpolate(
711 5296 vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
712 5296 vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
713 5296 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
714 10592 uint16x8_t res1 = interpolate(
715 5296 vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
716 5296 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
717 5296 vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
718 5296 res.val[0] =
719 5296 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
720
721 10592 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
722 5296 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
723 5296 offsets_d, v_src_stride_);
724 10592 load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
725 5296 offsets_d, a, b, c, d);
726 // Doubled fractions 001122..., high part
727 5296 xfrac2 = vzip2q(xfrac, xfrac);
728 5296 yfrac2 = vzip2q(yfrac, yfrac);
729 5296 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
730 5296 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
731 // Quadrupled fractions (00001111) are passed to interpolate
732 10592 res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
733 5296 vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
734 5296 vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
735 5296 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
736 10592 res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
737 5296 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
738 5296 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
739 5296 vzip2q(nyfrac2, nyfrac2));
740 5296 res.val[1] =
741 5296 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
742
743 5296 store_pixels_u8_4ch(res, dst);
744 5296 mapxy += ptrdiff_t(step);
745 5296 mapfrac += ptrdiff_t(step);
746 5296 dst += ptrdiff_t(step);
747 5296 };
748
749 192 LoopUnroll loop{width, MapVecTraits::num_lanes()};
750 192 loop.unroll_once(vector_path);
751 384 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
752 192 static_cast<ptrdiff_t>(loop.remaining_length());
753 192 mapxy -= back_step;
754 192 mapfrac -= back_step;
755 192 dst -= back_step;
756 304 loop.remaining([&](size_t, size_t step) { vector_path(step); });
757 192 }
758
759 private:
760 Rows<const ScalarType> src_rows_;
761 uint16x4_t v_src_stride_;
762 int16x8_t v_xmax_;
763 int16x8_t v_ymax_;
764 }; // end of class RemapS16Point5Replicate4ch<uint8_t>
765
766 template <>
767 class RemapS16Point5Replicate4ch<uint16_t> {
768 public:
769 using ScalarType = uint16_t;
770 using MapVecTraits = neon::VecTraits<int16_t>;
771
772 168 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
773 size_t src_height)
774 168 : src_rows_{src_rows},
775 336 v_src_element_stride_{vdup_n_u16(
776 168 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
777 168 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
778 168 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
779
780 192 void process_row(size_t width, Columns<const int16_t> mapxy,
781 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
782 5188 auto vector_path = [&](size_t step) {
783 4996 uint16x8_t x0, y0, x1, y1;
784 4996 uint16x8_t xfrac, yfrac;
785 9992 get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
786 4996 v_xmax_, v_ymax_);
787
788 4996 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
789 4996 uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
790 4996 uint16x8x4_t res;
791 9992 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
792 4996 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
793 4996 offsets_d, v_src_element_stride_);
794 9992 load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
795 4996 offsets_d, a_low, a_high, b_low, b_high,
796 c_low, c_high, d_low, d_high);
797
798 // Doubled fractions 001122..., low part
799 4996 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
800 4996 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
801 9992 uint16x8_t nxfrac2 =
802 4996 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
803 9992 uint16x8_t nyfrac2 =
804 4996 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
805 // Quadrupled fractions (00001111) are passed to interpolate
806 4996 res.val[0] =
807 9992 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
808 4996 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
809 4996 vzip1q(nyfrac2, nyfrac2));
810 4996 res.val[1] =
811 9992 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
812 4996 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
813 4996 vzip2q(nyfrac2, nyfrac2));
814
815 9992 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
816 4996 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
817 4996 offsets_d, v_src_element_stride_);
818 9992 load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
819 4996 offsets_d, a_low, a_high, b_low, b_high,
820 c_low, c_high, d_low, d_high);
821 // Doubled fractions 001122..., high part
822 4996 xfrac2 = vzip2q(xfrac, xfrac);
823 4996 yfrac2 = vzip2q(yfrac, yfrac);
824 4996 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
825 4996 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
826 // Quadrupled fractions (00001111) are passed to interpolate
827 4996 res.val[2] =
828 9992 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
829 4996 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
830 4996 vzip1q(nyfrac2, nyfrac2));
831 4996 res.val[3] =
832 9992 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
833 4996 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
834 4996 vzip2q(nyfrac2, nyfrac2));
835
836 4996 store_pixels_u16_4ch(res, dst);
837 4996 mapxy += ptrdiff_t(step);
838 4996 mapfrac += ptrdiff_t(step);
839 4996 dst += ptrdiff_t(step);
840 4996 };
841
842 192 LoopUnroll loop{width, MapVecTraits::num_lanes()};
843 192 loop.unroll_once(vector_path);
844 384 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
845 192 static_cast<ptrdiff_t>(loop.remaining_length());
846 192 mapxy -= back_step;
847 192 mapfrac -= back_step;
848 192 dst -= back_step;
849 304 loop.remaining([&](size_t, size_t step) { vector_path(step); });
850 192 }
851
852 private:
853 Rows<const ScalarType> src_rows_;
854 uint16x4_t v_src_element_stride_;
855 int16x8_t v_xmax_;
856 int16x8_t v_ymax_;
857 }; // end of class RemapS16Point5Replicate4ch<uint16_t>
858
859 // Constant border specific functions
860 10292 inline void get_coordinates_constant(
861 Columns<const int16_t> mapxy, Columns<const uint16_t> mapfrac,
862 uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0,
863 uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac,
864 uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c,
865 uint16x8_t &in_range_d) {
866 10292 get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
867
868 10292 uint16x8_t one = vdupq_n_u16(1);
869 10292 x1 = vaddq_u16(x0, one);
870 10292 y1 = vaddq_u16(y0, one);
871
872 10292 uint16x8_t x0_in_range = vcltq_u16(x0, v_width);
873 10292 uint16x8_t y0_in_range = vcltq_u16(y0, v_height);
874 10292 uint16x8_t x1_in_range = vcltq_u16(x1, v_width);
875 10292 uint16x8_t y1_in_range = vcltq_u16(y1, v_height);
876
877 10292 in_range_a = vandq(x0_in_range, y0_in_range);
878 10292 in_range_b = vandq(x1_in_range, y0_in_range);
879 10292 in_range_c = vandq(x0_in_range, y1_in_range);
880 10292 in_range_d = vandq(x1_in_range, y1_in_range);
881 10292 }
882
883 82336 inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range,
884 uint32x4_t offsets) {
885 82336 return vbslq_u32(in_range, offsets, vdupq_n_u32(0));
886 }
887
888 42368 inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range,
889 uint8x16_t pixels,
890 uint8x16_t v_border) {
891 42368 return vreinterpretq_u8_u32(
892 42368 vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), v_border));
893 }
894
895 79936 inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range,
896 uint16x8_t pixels,
897 uint16x8_t v_border) {
898 79936 return vreinterpretq_u16_u64(
899 79936 vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), v_border));
900 }
901
902 10592 inline void load_pixels_u8_4ch_constant(
903 Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
904 uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
905 uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
906 uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c,
907 uint8x16_t &d) {
908 10592 offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
909 10592 offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
910 10592 offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
911 10592 offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
912
913 10592 a = load_4px_4ch(src_rows, offsets_a);
914 10592 b = load_4px_4ch(src_rows, offsets_b);
915 10592 c = load_4px_4ch(src_rows, offsets_c);
916 10592 d = load_4px_4ch(src_rows, offsets_d);
917
918 10592 a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border);
919 10592 b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border);
920 10592 c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border);
921 10592 d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border);
922 10592 }
923
924 9992 inline void load_pixels_u16_4ch_constant(
925 Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
926 uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
927 uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
928 uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo,
929 uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo,
930 uint16x8_t &d_hi) {
931 9992 offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
932 9992 offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
933 9992 offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
934 9992 offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
935
936 9992 a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
937 9992 b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
938 9992 c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
939 9992 d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
940
941 // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
942 49960 auto low32_to_u64 = [](uint32x4_t bitset) {
943 39968 return vreinterpretq_u64_s64(
944 39968 vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset))));
945 };
946
947 19984 a_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo,
948 9992 v_border);
949 19984 b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo,
950 9992 v_border);
951 19984 c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo,
952 9992 v_border);
953 19984 d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo,
954 9992 v_border);
955
956 9992 a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
957 9992 b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
958 9992 c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
959 9992 d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
960
961 // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
962 49960 auto hi32_to_u64 = [](uint32x4_t bitset) {
963 39968 return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset)));
964 };
965
966 19984 a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi,
967 9992 v_border);
968 19984 b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi,
969 9992 v_border);
970 19984 c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi,
971 9992 v_border);
972 19984 d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi,
973 9992 v_border);
974 9992 }
975
976 // Convert bitsets such as in_range to 32bits, making all 1s or all 0s
977 41168 static uint32x4_t low16_to_s32(uint16x8_t bitset) {
978 41168 return vreinterpretq_u32_s32(
979 41168 vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset))));
980 }
981
982 41168 static uint32x4_t hi16_to_s32(uint16x8_t bitset) {
983 41168 return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset)));
984 }
985
986 template <typename ScalarType>
987 class RemapS16Point5Constant4ch;
988
989 template <>
990 class RemapS16Point5Constant4ch<uint8_t> {
991 public:
992 using ScalarType = uint8_t;
993 using MapVecTraits = neon::VecTraits<int16_t>;
994
995 168 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
996 size_t src_height, const ScalarType *border_value)
997 168 : src_rows_{src_rows},
998 168 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
999 168 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
1000 168 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
1001 168 v_border_{} {
1002 168 uint32_t border_value_32{};
1003 168 memcpy(&border_value_32, border_value, sizeof(uint32_t));
1004 168 v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32));
1005 168 }
1006
1007 192 void process_row(size_t width, Columns<const int16_t> mapxy,
1008 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1009 5488 auto vector_path = [&](size_t step) {
1010 5296 uint16x8_t x0, y0, x1, y1;
1011 5296 uint16x8_t xfrac, yfrac;
1012 5296 uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
1013 5296 get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
1014 y1, xfrac, yfrac, in_range_a, in_range_b,
1015 in_range_c, in_range_d);
1016
1017 5296 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
1018 5296 uint8x16_t a, b, c, d;
1019 5296 uint8x16x2_t res;
1020
1021 10592 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
1022 5296 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
1023 5296 offsets_d, v_src_stride_);
1024
1025 5296 load_pixels_u8_4ch_constant(
1026 5296 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1027 5296 low16_to_s32(in_range_a), low16_to_s32(in_range_b),
1028 5296 low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b,
1029 c, d);
1030
1031 // Doubled fractions 001122..., low part
1032 5296 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
1033 5296 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
1034 10592 uint16x8_t nxfrac2 =
1035 5296 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1036 10592 uint16x8_t nyfrac2 =
1037 5296 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1038 // Quadrupled fractions (00001111) are passed to interpolate
1039 10592 uint16x8_t res0 = interpolate(
1040 5296 vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
1041 5296 vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
1042 5296 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
1043 10592 uint16x8_t res1 = interpolate(
1044 5296 vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
1045 5296 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
1046 5296 vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
1047 5296 res.val[0] =
1048 5296 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
1049
1050 10592 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
1051 5296 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
1052 5296 offsets_d, v_src_stride_);
1053
1054 5296 load_pixels_u8_4ch_constant(
1055 5296 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1056 5296 hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
1057 5296 hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c,
1058 d);
1059 // Doubled fractions 001122..., high part
1060 5296 xfrac2 = vzip2q(xfrac, xfrac);
1061 5296 yfrac2 = vzip2q(yfrac, yfrac);
1062 5296 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1063 5296 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1064 // Quadrupled fractions (00001111) are passed to interpolate
1065 10592 res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
1066 5296 vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
1067 5296 vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
1068 5296 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
1069 10592 res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
1070 5296 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
1071 5296 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1072 5296 vzip2q(nyfrac2, nyfrac2));
1073 5296 res.val[1] =
1074 5296 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
1075
1076 5296 store_pixels_u8_4ch(res, dst);
1077 5296 mapxy += ptrdiff_t(step);
1078 5296 mapfrac += ptrdiff_t(step);
1079 5296 dst += ptrdiff_t(step);
1080 5296 };
1081
1082 192 LoopUnroll loop{width, MapVecTraits::num_lanes()};
1083 192 loop.unroll_once(vector_path);
1084 384 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
1085 192 static_cast<ptrdiff_t>(loop.remaining_length());
1086 192 mapxy -= back_step;
1087 192 mapfrac -= back_step;
1088 192 dst -= back_step;
1089 304 loop.remaining([&](size_t, size_t step) { vector_path(step); });
1090 192 }
1091
1092 private:
1093 Rows<const ScalarType> src_rows_;
1094 uint16x4_t v_src_stride_;
1095 uint16x8_t v_width_;
1096 uint16x8_t v_height_;
1097 uint8x16_t v_border_;
1098 }; // end of class RemapS16Point5Constant4ch<uint8_t>
1099
1100 template <>
1101 class RemapS16Point5Constant4ch<uint16_t> {
1102 public:
1103 using ScalarType = uint16_t;
1104 using MapVecTraits = neon::VecTraits<int16_t>;
1105
1106 168 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
1107 size_t src_height, const ScalarType *border_value)
1108 168 : src_rows_{src_rows},
1109 336 v_src_element_stride_{vdup_n_u16(
1110 168 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
1111 168 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
1112 168 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
1113 168 v_border_{} {
1114 168 uint64_t border_value_64{};
1115 168 memcpy(&border_value_64, border_value, sizeof(uint64_t));
1116 168 v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64));
1117 168 }
1118
1119 192 void process_row(size_t width, Columns<const int16_t> mapxy,
1120 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1121 5188 auto vector_path = [&](size_t step) {
1122 4996 uint16x8_t x0, y0, x1, y1;
1123 4996 uint16x8_t xfrac, yfrac;
1124 4996 uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
1125 4996 get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
1126 y1, xfrac, yfrac, in_range_a, in_range_b,
1127 in_range_c, in_range_d);
1128
1129 4996 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
1130 4996 uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
1131 4996 uint16x8x4_t res;
1132
1133 9992 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
1134 4996 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
1135 4996 offsets_d, v_src_element_stride_);
1136
1137 4996 load_pixels_u16_4ch_constant(
1138 4996 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1139 4996 low16_to_s32(in_range_a), low16_to_s32(in_range_b),
1140 4996 low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low,
1141 a_high, b_low, b_high, c_low, c_high, d_low, d_high);
1142
1143 // Doubled fractions 001122..., low part
1144 4996 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
1145 4996 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
1146 9992 uint16x8_t nxfrac2 =
1147 4996 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1148 9992 uint16x8_t nyfrac2 =
1149 4996 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1150 // Quadrupled fractions (00001111) are passed to interpolate
1151 4996 res.val[0] =
1152 9992 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
1153 4996 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
1154 4996 vzip1q(nyfrac2, nyfrac2));
1155 4996 res.val[1] =
1156 9992 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
1157 4996 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1158 4996 vzip2q(nyfrac2, nyfrac2));
1159
1160 9992 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
1161 4996 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
1162 4996 offsets_d, v_src_element_stride_);
1163
1164 4996 load_pixels_u16_4ch_constant(
1165 4996 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1166 4996 hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
1167 4996 hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low,
1168 a_high, b_low, b_high, c_low, c_high, d_low, d_high);
1169
1170 // Doubled fractions 001122..., high part
1171 4996 xfrac2 = vzip2q(xfrac, xfrac);
1172 4996 yfrac2 = vzip2q(yfrac, yfrac);
1173 4996 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1174 4996 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1175 // Quadrupled fractions (00001111) are passed to interpolate
1176 4996 res.val[2] =
1177 9992 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
1178 4996 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
1179 4996 vzip1q(nyfrac2, nyfrac2));
1180 4996 res.val[3] =
1181 9992 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
1182 4996 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1183 4996 vzip2q(nyfrac2, nyfrac2));
1184
1185 4996 store_pixels_u16_4ch(res, dst);
1186 4996 mapxy += ptrdiff_t(step);
1187 4996 mapfrac += ptrdiff_t(step);
1188 4996 dst += ptrdiff_t(step);
1189 4996 };
1190
1191 192 LoopUnroll loop{width, MapVecTraits::num_lanes()};
1192 192 loop.unroll_once(vector_path);
1193 384 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
1194 192 static_cast<ptrdiff_t>(loop.remaining_length());
1195 192 mapxy -= back_step;
1196 192 mapfrac -= back_step;
1197 192 dst -= back_step;
1198 304 loop.remaining([&](size_t, size_t step) { vector_path(step); });
1199 192 }
1200
1201 private:
1202 Rows<const ScalarType> src_rows_;
1203 uint16x4_t v_src_element_stride_;
1204 uint16x8_t v_width_;
1205 uint16x8_t v_height_;
1206 uint16x8_t v_border_;
1207 }; // end of class RemapS16Point5Constant4ch<uint16_t>
1208
1209 // Most of the complexity comes from parameter checking.
1210 // NOLINTBEGIN(readability-function-cognitive-complexity)
1211 template <typename T>
1212 1428 kleidicv_error_t remap_s16point5(
1213 const T *src, size_t src_stride, size_t src_width, size_t src_height,
1214 T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1215 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
1216 const uint16_t *mapfrac, size_t mapfrac_stride,
1217 [[maybe_unused]] kleidicv_border_type_t border_type,
1218 [[maybe_unused]] const T *border_value) {
1219
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 712 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 712 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 712 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 712 times.
1428 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1220
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 710 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 710 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 710 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 710 times.
1424 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1221
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 708 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 708 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 708 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 708 times.
1420 CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
1222
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 706 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 706 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 706 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 706 times.
1416 CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height);
1223
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 704 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 700 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 700 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 704 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 700 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 700 times.
1412 CHECK_IMAGE_SIZE(src_width, src_height);
1224
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 698 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 696 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 696 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 698 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 696 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 696 times.
1400 CHECK_IMAGE_SIZE(dst_width, dst_height);
1225
8/8
✓ Branch 0 taken 340 times.
✓ Branch 1 taken 356 times.
✓ Branch 2 taken 338 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 340 times.
✓ Branch 5 taken 356 times.
✓ Branch 6 taken 338 times.
✓ Branch 7 taken 2 times.
1392 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
1226 4 return KLEIDICV_ERROR_NULL_POINTER;
1227 }
1228
1229
8/8
✓ Branch 0 taken 674 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 674 times.
✓ Branch 3 taken 20 times.
✓ Branch 4 taken 674 times.
✓ Branch 5 taken 20 times.
✓ Branch 6 taken 674 times.
✓ Branch 7 taken 20 times.
2776 if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height,
1230 1388 dst_width, border_type, channels)) {
1231 40 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1232 }
1233
1234 1348 Rows<const T> src_rows{src, src_stride, channels};
1235 1348 Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
1236 1348 Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1};
1237 1348 Rows<T> dst_rows{dst, dst_stride, channels};
1238 1348 Rectangle rect{dst_width, dst_height};
1239
4/4
✓ Branch 0 taken 338 times.
✓ Branch 1 taken 336 times.
✓ Branch 2 taken 338 times.
✓ Branch 3 taken 336 times.
1348 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
1240
4/4
✓ Branch 0 taken 168 times.
✓ Branch 1 taken 168 times.
✓ Branch 2 taken 168 times.
✓ Branch 3 taken 168 times.
672 if (channels == 1) {
1241 672 RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height,
1242 336 border_value};
1243 336 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1244 336 } else {
1245 assert(channels == 4);
1246 672 RemapS16Point5Constant4ch<T> operation{src_rows, src_width, src_height,
1247 336 border_value};
1248 336 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1249 336 }
1250 672 } else {
1251 assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
1252
4/4
✓ Branch 0 taken 170 times.
✓ Branch 1 taken 168 times.
✓ Branch 2 taken 170 times.
✓ Branch 3 taken 168 times.
676 if (channels == 1) {
1253 340 RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height};
1254 340 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1255 340 } else {
1256 assert(channels == 4);
1257 336 RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height};
1258 336 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1259 336 }
1260 }
1261 1348 return KLEIDICV_OK;
1262 1428 }
1263 // NOLINTEND(readability-function-cognitive-complexity)
1264
1265 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \
1266 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \
1267 const type *src, size_t src_stride, size_t src_width, size_t src_height, \
1268 type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
1269 size_t channels, const int16_t *mapxy, size_t mapxy_stride, \
1270 const uint16_t *mapfrac, size_t mapfrac_stride, \
1271 kleidicv_border_type_t border_type, const type *border_value)
1272
1273 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
1274 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
1275
1276 } // namespace kleidicv::neon
1277