KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/remap_s16point5_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 790 790 100.0%
Functions: 67 67 100.0%
Branches: 84 84 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/neon.h"
8 #include "kleidicv/transform/remap.h"
9
10 namespace kleidicv::neon {
11
// Primary template, declared only. The visible listing provides explicit
// specialisations for uint8_t and uint16_t.
12 template <typename ScalarType>
13 class RemapS16Point5Replicate;
14
// Remaps a single-channel u8 image using integer (x, y) map coordinates plus
// a packed fractional part, blending the four neighbouring source pixels.
// Coordinates outside the image are clamped to [0, max], so the nearest edge
// pixel is used for them.
15 template <>
16 class RemapS16Point5Replicate<uint8_t> {
17 public:
18 using ScalarType = uint8_t;
19 using MapVecTraits = neon::VecTraits<int16_t>;
20 using MapVectorType = typename MapVecTraits::VectorType;
21 using MapVector2Type = typename MapVecTraits::Vector2Type;
22 using FracVecTraits = neon::VecTraits<uint16_t>;
23 using FracVectorType = typename FracVecTraits::VectorType;
24
// Stores the source rows and broadcasts the row stride, src_width - 1 and
// src_height - 1 into vectors.
// NOTE(review): the stride is narrowed to uint16_t and the maxima to int16_t;
// any range validation must happen in the caller, which is not visible here.
25 134 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
26 size_t src_height)
27 134 : src_rows_{src_rows},
28 134 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
29 134 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
30 134 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
31
// Produces one destination row.
//   width   - passed to LoopUnroll as the row length
//   mapxy   - interleaved (x, y) int16 coordinates, de-interleaved by vld2q_s16
//   mapfrac - packed fractions; x in the low bits, y above it
//   dst     - destination pixels, 8 bytes written per vector_path call
32 158 void process_row(size_t width, Columns<const int16_t> mapxy,
33 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
34 5432 auto vector_path = [&](size_t step) {
35 5274 MapVector2Type xy = vld2q_s16(&mapxy[0]);
36 5274 FracVectorType frac = vld1q_u16(&mapfrac[0]);
// The fraction is forced to zero for negative coordinates, so after clamping
// only the x0/y0 pixel contributes.
37 10548 uint16x8_t xfrac =
38 10548 vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
39 // extract xfrac = frac[0:4]
40 5274 vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
41 10548 uint16x8_t yfrac =
42 10548 vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
43 // extract yfrac = frac[5:9]
44 10548 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
45 5274 vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
// Complementary weights: FRAC_MAX - frac.
46 5274 uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
47 5274 uint16x8_t nyfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
48
49 // Clamp coordinates to within the dimensions of the source image
50 10548 uint16x8_t x0 = vreinterpretq_u16_s16(
51 5274 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
52 10548 uint16x8_t y0 = vreinterpretq_u16_s16(
53 5274 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
54
// The comparison yields all-ones (i.e. -1) in lanes where the unclamped
// coordinate is below the maximum; subtracting that adds one.
55 // x1 = x0 + 1, except if it's already xmax
56 5274 uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_));
57 5274 uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_));
58
// The 8 lanes are processed as two groups of 4; x is widened to 32 bits
// because it is the accumulator of the offset computation.
59 10548 uint16x4_t dst_low = load_and_interpolate(
60 5274 vmovl_u16(vget_low_u16(x0)), vget_low_u16(y0),
61 5274 vmovl_u16(vget_low_u16(x1)), vget_low_u16(y1), vget_low_u16(xfrac),
62 5274 vget_low_u16(yfrac), vget_low_u16(nxfrac), vget_low_u16(nyfrac));
63
64 10548 uint16x4_t dst_high = load_and_interpolate(
65 5274 vmovl_high_u16(x0), vget_high_u16(y0), vmovl_high_u16(x1),
66 5274 vget_high_u16(y1), vget_high_u16(xfrac), vget_high_u16(yfrac),
67 5274 vget_high_u16(nxfrac), vget_high_u16(nyfrac));
68
// vuzp1_u8 keeps the even-numbered bytes, i.e. one byte out of every 16-bit
// result lane, giving 8 output pixels.
// NOTE(review): uint16x4_t values are passed to a u8 intrinsic here, which
// relies on implicit vector conversions being enabled.
69 5274 vst1_u8(&dst[0], vuzp1_u8(dst_low, dst_high));
70 5274 mapxy += ptrdiff_t(step);
71 5274 mapfrac += ptrdiff_t(step);
72 5274 dst += ptrdiff_t(step);
73 5274 };
74 158 LoopUnroll loop{width, MapVecTraits::num_lanes()};
75 158 loop.unroll_once(vector_path);
// Rewind by (step - remaining_length) before handling the remainder.
// NOTE(review): presumably this lets one more full-width vector cover the
// tail by overlapping elements already written; LoopUnroll is defined
// elsewhere, so confirm there.
76 316 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
77 158 static_cast<ptrdiff_t>(loop.remaining_length());
78 158 mapxy -= back_step;
79 158 mapfrac -= back_step;
80 158 dst -= back_step;
81 232 loop.remaining([&](size_t, size_t step) { vector_path(step); });
82 158 }
83
84 private:
// Gathers the four neighbours (a, b, c, d) of 4 output pixels and blends
// them. x0/x1 arrive widened to 32 bits; y0/y1 and the weights are 16-bit.
// Returns one 16-bit lane per pixel holding the biased sum shifted right by
// 2 * REMAP16POINT5_FRAC_BITS.
85 10548 uint16x4_t load_and_interpolate(uint32x4_t x0, uint16x4_t y0, uint32x4_t x1,
86 uint16x4_t y1, uint16x4_t xfrac,
87 uint16x4_t yfrac, uint16x4_t nxfrac,
88 uint16x4_t nyfrac) {
89 // Calculate offsets from coordinates (y * stride + x)
90 // a: top left, b: top right, c: bottom left, d: bottom right
91 10548 uint32x4_t offset = vmlal_u16(x0, y0, v_src_stride_);
// Each gathered byte is placed in its own 16-bit slot of a 64-bit scalar,
// which is then moved into a vector as four u16 lanes.
92 21096 uint64_t acc =
93 21096 static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
94 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
95 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
96 10548 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
97 10548 uint16x4_t a = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
98
99 10548 offset = vmlal_u16(x1, y0, v_src_stride_);
100
101 31644 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
102 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
103 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
104 10548 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
105 10548 uint16x4_t b = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
106
// Top row, horizontal blend in 16 bits: xfrac * b + nxfrac * a.
107 10548 uint16x4_t line0 = vmla_u16(vmul_u16(xfrac, b), nxfrac, a);
108
109 10548 offset = vmlal_u16(x0, y1, v_src_stride_);
110
111 31644 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
112 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
113 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
114 10548 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
115 10548 uint16x4_t c = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
116
// Start the 32-bit vertical sum with the rounding bias
// (REMAP16POINT5_FRAC_MAX_SQUARE / 2) plus the top row times nyfrac.
117 21096 uint32x4_t line0_lerpd = vmlal_u16(
118 10548 vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, nyfrac);
119
120 10548 offset = vmlal_u16(x1, y1, v_src_stride_);
121
122 31644 acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
123 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) |
124 21096 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) |
125 10548 (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48);
126 10548 uint16x4_t d = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0));
127
// Bottom row blend, then add it weighted by yfrac and shift back down,
// narrowing to 16 bits.
128 10548 uint16x4_t line1 = vmla_u16(vmul_u16(xfrac, d), nxfrac, c);
129 21096 return vshrn_n_u32(vmlal_u16(line0_lerpd, line1, yfrac),
130 2 * REMAP16POINT5_FRAC_BITS);
131 10548 }
132
133 Rows<const ScalarType> src_rows_;
134 uint16x4_t v_src_stride_;
135 int16x8_t v_xmax_;
136 int16x8_t v_ymax_;
137 }; // end of class RemapS16Point5Replicate<uint8_t>
138
139 // Common interpolation function used by all RemapS16Point5 operations except
140 // 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>)
141 // because that processes one half vector in one step
//
// Blends 8 lanes at once. a/b are the top-left/top-right neighbours and c/d
// the bottom-left/bottom-right ones; xfrac/yfrac are the weights of the
// right/bottom neighbours and nxfrac/nyfrac the complementary weights.
// All arithmetic is done in 32-bit lanes; the result is the low 16 bits of
// (bias + top * nyfrac + bottom * yfrac) >> (2 * REMAP16POINT5_FRAC_BITS).
142 97170 static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
143 uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac,
144 uint16x8_t nxfrac, uint16x8_t nyfrac) {
// Horizontal blend of 4 lanes with widening multiplies:
// nfrac * left + frac * right.
145 485850 auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right,
146 uint16x4_t frac,
147 uint16x4_t nfrac) -> uint32x4_t {
148 388680 return vmlal_u16(vmull_u16(nfrac, left), frac, right);
149 };
150
// Wrappers that apply the horizontal blend to the low / high 4 lanes.
151 291510 auto interpolate_horizontal_low = [interpolate_horizontal](
152 uint16x8_t left, uint16x8_t right,
153 uint16x8_t frac,
154 uint16x8_t nfrac) -> uint32x4_t {
155 388680 return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
156 194340 vget_low_u16(frac), vget_low_u16(nfrac));
157 };
158
159 291510 auto interpolate_horizontal_high = [interpolate_horizontal](
160 uint16x8_t left, uint16x8_t right,
161 uint16x8_t frac,
162 uint16x8_t nfrac) -> uint32x4_t {
163 388680 return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
164 194340 vget_high_u16(frac), vget_high_u16(nfrac));
165 };
166
167 // Offset pixel values by 0.5 before rounding down.
168 97170 const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
169
// Vertical blend in 32 bits. The lambda parameters a and b shadow the
// function parameters of the same name; here they are the two row sums.
170 291510 auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
171 uint32x4_t nfrac) -> uint32x4_t {
172 194340 uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
173 388680 return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
174 194340 };
175
176 97170 uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac);
177 97170 uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac);
178 97170 uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac);
179 97170 uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac);
180
181 194340 uint32x4_t lo =
182 194340 interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)),
183 97170 vmovl_u16(vget_low_u16(nyfrac)));
184 194340 uint32x4_t hi = interpolate_vertical(
185 97170 line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac));
186
187 // Discard upper 16 bits of each element (lower the precision back to the
188 // original 16 bits)
189 194340 uint16x8_t result =
190 97170 vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
191 194340 return result;
192 97170 }
193
// Single-channel u16 variant of the replicated-border remap. Per-iteration
// state (weights and neighbour coordinates) is kept in member vectors that
// prepare_maps() fills and transform_pixels() consumes.
194 template <>
195 class RemapS16Point5Replicate<uint16_t> {
196 public:
197 using ScalarType = uint16_t;
198 using MapVecTraits = neon::VecTraits<int16_t>;
199
// Broadcasts the row stride expressed in elements (byte stride divided by
// sizeof(ScalarType)) and the maximum valid coordinates; the working
// vectors start at zero.
200 134 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
201 size_t src_height)
202 134 : src_rows_{src_rows},
203 268 v_src_element_stride_{vdupq_n_u16(
204 134 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
205 134 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
206 134 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))},
207 134 xfrac_{vdupq_n_u16(0)},
208 134 yfrac_{vdupq_n_u16(0)},
209 134 nxfrac_{vdupq_n_u16(0)},
210 134 nyfrac_{vdupq_n_u16(0)},
211 134 x0_{vdupq_n_s16(0)},
212 134 x1_{vdupq_n_s16(0)},
213 134 y0_{vdupq_n_s16(0)},
214 134 y1_{vdupq_n_s16(0)} {}
215
// Produces one destination row, 8 pixels per vector_path call. The loop
// structure (one unrolled pass, rewind by step - remaining_length, then the
// remainder) is the same as in the other process_row functions in this file.
216 158 void process_row(size_t width, Columns<const int16_t> mapxy,
217 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
218 5132 auto vector_path = [&](size_t step) {
219 4974 prepare_maps(mapxy, mapfrac);
220 4974 transform_pixels(dst);
221
222 4974 mapxy += ptrdiff_t(step);
223 4974 mapfrac += ptrdiff_t(step);
224 4974 dst += ptrdiff_t(step);
225 4974 };
226 158 LoopUnroll loop{width, MapVecTraits::num_lanes()};
227 158 loop.unroll_once(vector_path);
228 316 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
229 158 static_cast<ptrdiff_t>(loop.remaining_length());
230 158 mapxy -= back_step;
231 158 mapfrac -= back_step;
232 158 dst -= back_step;
233 232 loop.remaining([&](size_t, size_t step) { vector_path(step); });
234 158 }
235
// Loads 8 coordinates and fractions and fills the member vectors: weights
// (zeroed for negative coordinates), clamped x0/y0, and x1/y1 one past them
// unless already at the maximum.
// NOTE(review): x0_/x1_/y0_/y1_ are declared int16x8_t but are assigned the
// uint16x8_t results of vreinterpretq_u16_s16 / vsubq_u16, which relies on
// implicit vector conversions.
236 4974 void prepare_maps(Columns<const int16_t> mapxy,
237 Columns<const uint16_t> mapfrac) {
238 4974 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
239 4974 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
240 4974 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
241 4974 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
242 9948 xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
243 4974 vandq_u16(frac, frac_mask));
244 4974 yfrac_ = vbslq_u16(
245 4974 vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
246 4974 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask));
247 4974 nxfrac_ = vsubq_u16(frac_max, xfrac_);
248 4974 nyfrac_ = vsubq_u16(frac_max, yfrac_);
249
250 // Clamp coordinates to within the dimensions of the source image
251 4974 x0_ = vreinterpretq_u16_s16(
252 4974 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_)));
253 4974 y0_ = vreinterpretq_u16_s16(
254 4974 vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_)));
255
256 // x1 = x0 + 1, except if it's already xmax
257 4974 x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_));
258 4974 y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_));
259 4974 }
260
// Gathers the four neighbours for the 8 prepared pixels, blends them with
// interpolate() and stores 8 u16 results at dst.
261 4974 void transform_pixels(Columns<uint16_t> dst) {
262 4974 uint16x8_t a = load_pixels(x0_, y0_);
263 4974 uint16x8_t b = load_pixels(x1_, y0_);
264 4974 uint16x8_t c = load_pixels(x0_, y1_);
265 4974 uint16x8_t d = load_pixels(x1_, y1_);
266
267 9948 uint16x8_t result =
268 4974 interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
269
270 4974 vst1q_u16(&dst[0], result);
271 4974 }
272
// Gathers 8 source pixels at the given coordinates, one scalar read per lane.
// NOTE(review): the coordinates passed by transform_pixels() were already
// clamped in prepare_maps(), so this second clamp looks redundant for that
// caller. vminq_u16 also receives v_xmax_/v_ymax_, which are declared
// int16x8_t.
273 19896 uint16x8_t load_pixels(int16x8_t x, int16x8_t y) {
274 // Clamp coordinates to within the dimensions of the source image
275 39792 uint16x8_t x_clamped =
276 19896 vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_);
277 39792 uint16x8_t y_clamped =
278 19896 vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_);
279
280 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
281 39792 uint32x4_t indices_low =
282 39792 vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped),
283 19896 vget_low_u16(v_src_element_stride_));
284 39792 uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped),
285 19896 y_clamped, v_src_element_stride_);
286
287 // Read pixels from source
288 179064 uint16x8_t pixels = {
289 19896 src_rows_[vgetq_lane_u32(indices_low, 0)],
290 19896 src_rows_[vgetq_lane_u32(indices_low, 1)],
291 19896 src_rows_[vgetq_lane_u32(indices_low, 2)],
292 19896 src_rows_[vgetq_lane_u32(indices_low, 3)],
293 19896 src_rows_[vgetq_lane_u32(indices_high, 0)],
294 19896 src_rows_[vgetq_lane_u32(indices_high, 1)],
295 19896 src_rows_[vgetq_lane_u32(indices_high, 2)],
296 19896 src_rows_[vgetq_lane_u32(indices_high, 3)],
297 };
298
299 39792 return pixels;
300 19896 }
301
302 private:
303 Rows<const ScalarType> src_rows_;
304 uint16x8_t v_src_element_stride_;
305 int16x8_t v_xmax_;
306 int16x8_t v_ymax_;
307 uint16x8_t xfrac_;
308 uint16x8_t yfrac_;
309 uint16x8_t nxfrac_;
310 uint16x8_t nyfrac_;
311 int16x8_t x0_;
312 int16x8_t x1_;
313 int16x8_t y0_;
314 int16x8_t y1_;
315 }; // end of class RemapS16Point5Replicate<uint16_t>
316
// Primary template, declared only. The visible listing provides explicit
// specialisations for uint8_t and uint16_t.
317 template <typename ScalarType>
318 class RemapS16Point5ConstantBorder;
319
// Single-channel u8 remap where any neighbour whose coordinates fall outside
// the source image is replaced by a constant border value before blending.
320 template <>
321 class RemapS16Point5ConstantBorder<uint8_t> {
322 public:
323 using ScalarType = uint8_t;
324 using MapVecTraits = neon::VecTraits<int16_t>;
325
// Broadcasts the stride, width, height and the border value (read through
// border_value, which is dereferenced unconditionally) into u16 vectors.
326 132 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
327 size_t src_width, size_t src_height,
328 const ScalarType *border_value)
329 132 : src_rows_{src_rows},
330 132 v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
331 132 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
332 132 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
333 132 v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {}
334
// Produces one destination row, 8 pixels per vector_path call.
335 156 void process_row(size_t width, Columns<const int16_t> mapxy,
336 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
337 5428 auto vector_path = [&](size_t step) {
338 5272 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
339 5272 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
340 5272 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
341 5272 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
// Unlike the replicated-border classes, the fractions are not zeroed for
// negative coordinates here.
342 5272 uint16x8_t xfrac = vandq_u16(frac, frac_mask);
343 10544 uint16x8_t yfrac =
344 5272 vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
345 5272 uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac);
346 5272 uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac);
347
// Coordinates are kept as raw bit patterns (no clamping); the +1 neighbours
// use wrapping u16 addition, so -1 becomes 0.
348 5272 uint16x8_t one = vdupq_n_u16(1);
349 5272 uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]);
350 5272 uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]);
351 5272 uint16x8_t x1 = vaddq_u16(x0, one);
352 5272 uint16x8_t y1 = vaddq_u16(y0, one);
353
354 10544 uint16x8_t a = load_pixels_or_constant_border(
355 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0);
356 10544 uint16x8_t b = load_pixels_or_constant_border(
357 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0);
358 10544 uint16x8_t c = load_pixels_or_constant_border(
359 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1);
360 10544 uint16x8_t d = load_pixels_or_constant_border(
361 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1);
362
363 5272 uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac);
364
// Narrow the 16-bit results to bytes with saturation and store 8 pixels.
365 5272 vst1_u8(&dst[0], vqmovn_u16(result));
366 5272 mapxy += ptrdiff_t(step);
367 5272 mapfrac += ptrdiff_t(step);
368 5272 dst += ptrdiff_t(step);
369 5272 };
370 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
371 156 loop.unroll_once(vector_path);
// Rewind by (step - remaining_length) before the remainder pass.
372 312 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
373 156 static_cast<ptrdiff_t>(loop.remaining_length());
374 156 mapxy -= back_step;
375 156 mapfrac -= back_step;
376 156 dst -= back_step;
377 230 loop.remaining([&](size_t, size_t step) { vector_path(step); });
378 156 }
379
380 private:
// Gathers 8 pixels at (x, y), widened to u16, substituting v_border_ in
// lanes whose coordinates are out of range.
// NOTE(review): the parameters carry the same trailing-underscore names as
// the class members and therefore shadow them; the only visible caller
// passes the members themselves. x and y are already uint16x8_t, so the
// vreinterpretq_u16_s16 calls below rely on implicit vector conversions.
381 21088 uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_,
382 uint16x8_t v_src_element_stride_,
383 uint16x8_t v_width_,
384 uint16x8_t v_height_,
385 uint16x8_t v_border_, uint16x8_t x,
386 uint16x8_t y) {
387 // Find whether coordinates are within the image dimensions.
388 // Negative coordinates are interpreted as large values due to the s16->u16
389 // reinterpretation.
390 42176 uint16x8_t in_range =
391 42176 vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
392 21088 vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
393
// Out-of-range lanes then compute offset 0, so the scalar reads below stay
// on a valid element; their value is discarded by the final select.
394 // Zero out-of-range coordinates.
395 21088 x = vandq_u16(in_range, x);
396 21088 y = vandq_u16(in_range, y);
397
398 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
399 42176 uint32x4_t indices_low =
400 42176 vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
401 21088 vget_low_u16(v_src_element_stride_));
402 42176 uint32x4_t indices_high =
403 21088 vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
404
405 // Read pixels from source
406 189792 uint8x8_t pixels = {
407 21088 src_rows_[vgetq_lane_u32(indices_low, 0)],
408 21088 src_rows_[vgetq_lane_u32(indices_low, 1)],
409 21088 src_rows_[vgetq_lane_u32(indices_low, 2)],
410 21088 src_rows_[vgetq_lane_u32(indices_low, 3)],
411 21088 src_rows_[vgetq_lane_u32(indices_high, 0)],
412 21088 src_rows_[vgetq_lane_u32(indices_high, 1)],
413 21088 src_rows_[vgetq_lane_u32(indices_high, 2)],
414 21088 src_rows_[vgetq_lane_u32(indices_high, 3)],
415 };
416 // Select between source pixels and border colour
417 42176 return vbslq_u16(in_range, vmovl_u8(pixels), v_border_);
418 21088 }
419
420 Rows<const ScalarType> src_rows_;
421 uint16x8_t v_src_stride_;
422 uint16x8_t v_width_;
423 uint16x8_t v_height_;
424 uint16x8_t v_border_;
425 }; // end of class RemapS16Point5ConstantBorder<uint8_t>
426
// Single-channel u16 variant of the constant-border remap. Per-iteration
// state is kept in member vectors that prepare_maps() fills and
// transform_pixels() consumes.
427 template <>
428 class RemapS16Point5ConstantBorder<uint16_t> {
429 public:
430 using ScalarType = uint16_t;
431 using MapVecTraits = neon::VecTraits<int16_t>;
432
// Broadcasts the element stride (byte stride / sizeof(ScalarType)), width,
// height and the border value read through border_value; the working
// vectors start at zero.
433 132 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
434 size_t src_width, size_t src_height,
435 const ScalarType *border_value)
436 132 : src_rows_{src_rows},
437 264 v_src_element_stride_{vdupq_n_u16(
438 132 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
439 132 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
440 132 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
441 132 v_border_{vdupq_n_u16(*border_value)},
442 132 xfrac_{vdupq_n_u16(0)},
443 132 yfrac_{vdupq_n_u16(0)},
444 132 nxfrac_{vdupq_n_u16(0)},
445 132 nyfrac_{vdupq_n_u16(0)},
446 132 x0_{vdupq_n_s16(0)},
447 132 x1_{vdupq_n_s16(0)},
448 132 y0_{vdupq_n_s16(0)},
449 132 y1_{vdupq_n_s16(0)} {}
450
// Loads 8 coordinates and fractions and fills the member vectors. The
// coordinates are not clamped; x1/y1 are x0/y0 plus one.
// NOTE(review): x0_/y0_ are int16x8_t members that are fed to vaddq_u16 and
// assigned its uint16x8_t result, relying on implicit vector conversions.
451 4972 void prepare_maps(Columns<const int16_t> mapxy,
452 Columns<const uint16_t> mapfrac) {
453 4972 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
454 4972 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
455 4972 uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
456 4972 uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
457 4972 xfrac_ = vandq_u16(frac, frac_mask);
458 4972 yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
459 4972 nxfrac_ = vsubq_u16(frac_max, xfrac_);
460 4972 nyfrac_ = vsubq_u16(frac_max, yfrac_);
461
462 4972 uint16x8_t one = vdupq_n_u16(1);
463 4972 x0_ = xy.val[0];
464 4972 y0_ = xy.val[1];
465 4972 x1_ = vaddq_u16(x0_, one);
466 4972 y1_ = vaddq_u16(y0_, one);
467 4972 }
468
// Produces one destination row, 8 pixels per vector_path call, using the
// same unroll / rewind / remainder structure as the other process_row
// functions in this file.
469 156 void process_row(size_t width, Columns<const int16_t> mapxy,
470 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
471 5128 auto vector_path = [&](size_t step) {
472 4972 prepare_maps(mapxy, mapfrac);
473 4972 transform_pixels(dst);
474
475 4972 mapxy += ptrdiff_t(step);
476 4972 mapfrac += ptrdiff_t(step);
477 4972 dst += ptrdiff_t(step);
478 4972 };
479 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
480 156 loop.unroll_once(vector_path);
481 312 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
482 156 static_cast<ptrdiff_t>(loop.remaining_length());
483 156 mapxy -= back_step;
484 156 mapfrac -= back_step;
485 156 dst -= back_step;
486 230 loop.remaining([&](size_t, size_t step) { vector_path(step); });
487 156 }
488
// Gathers the four neighbours (or border value), blends them with
// interpolate() and stores 8 u16 results at dst.
489 4972 void transform_pixels(Columns<uint16_t> dst) {
490 4972 uint16x8_t a = load_pixels(x0_, y0_);
491 4972 uint16x8_t b = load_pixels(x1_, y0_);
492 4972 uint16x8_t c = load_pixels(x0_, y1_);
493 4972 uint16x8_t d = load_pixels(x1_, y1_);
494
495 9944 uint16x8_t result =
496 4972 interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
497
498 4972 vst1q_u16(&dst[0], result);
499 4972 }
500
// Gathers 8 pixels at (x, y), substituting v_border_ in lanes whose
// coordinates are out of range.
501 19888 uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) {
502 // Find whether coordinates are within the image dimensions.
503 // Negative coordinates are interpreted as large values due to the s16->u16
504 // reinterpretation.
505 39776 uint16x8_t in_range =
506 39776 vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_),
507 19888 vcltq_u16(vreinterpretq_u16_s16(y), v_height_));
508
// Out-of-range lanes then compute offset 0; the value read there is
// discarded by the final select.
509 // Zero out-of-range coordinates.
510 19888 x = vandq_u16(in_range, x);
511 19888 y = vandq_u16(in_range, y);
512
513 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
514 39776 uint32x4_t indices_low =
515 39776 vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y),
516 19888 vget_low_u16(v_src_element_stride_));
517 39776 uint32x4_t indices_high =
518 19888 vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_);
519
520 // Read pixels from source
521 178992 uint16x8_t pixels = {
522 19888 src_rows_[vgetq_lane_u32(indices_low, 0)],
523 19888 src_rows_[vgetq_lane_u32(indices_low, 1)],
524 19888 src_rows_[vgetq_lane_u32(indices_low, 2)],
525 19888 src_rows_[vgetq_lane_u32(indices_low, 3)],
526 19888 src_rows_[vgetq_lane_u32(indices_high, 0)],
527 19888 src_rows_[vgetq_lane_u32(indices_high, 1)],
528 19888 src_rows_[vgetq_lane_u32(indices_high, 2)],
529 19888 src_rows_[vgetq_lane_u32(indices_high, 3)],
530 };
531 // Select between source pixels and border colour
532 39776 return vbslq_u16(in_range, pixels, v_border_);
533 19888 }
534
535 private:
536 Rows<const ScalarType> src_rows_;
537 uint16x8_t v_src_element_stride_;
538 uint16x8_t v_width_;
539 uint16x8_t v_height_;
540 uint16x8_t v_border_;
541 uint16x8_t xfrac_;
542 uint16x8_t yfrac_;
543 uint16x8_t nxfrac_;
544 uint16x8_t nyfrac_;
545 int16x8_t x0_;
546 int16x8_t x1_;
547 int16x8_t y0_;
548 int16x8_t y1_;
549 }; // end of class RemapS16Point5ConstantBorder<uint16_t>
550
// Loads 8 map entries: de-interleaves the (x, y) pairs from mapxy into x and
// y, and splits each packed mapfrac value into xfrac (low bits, masked with
// REMAP16POINT5_FRAC_MAX - 1) and yfrac (the bits above
// REMAP16POINT5_FRAC_BITS, same mask). No clamping or zeroing is done here.
// NOTE(review): the signed lanes from vld2q_s16 are assigned to uint16x8_t
// outputs without an explicit reinterpret.
551 20488 inline void get_coordinates(Columns<const int16_t> mapxy,
552 Columns<const uint16_t> mapfrac, uint16x8_t &x,
553 uint16x8_t &y, uint16x8_t &xfrac,
554 uint16x8_t &yfrac) {
555 20488 int16x8x2_t xy = vld2q_s16(&mapxy[0]);
556 20488 x = xy.val[0];
557 20488 y = xy.val[1];
558
559 20488 uint16x8_t frac = vld1q_u16(&mapfrac[0]);
560 20488 xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
561 40976 yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
562 20488 vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
563 20488 }
564
// Computes the 32-bit element offsets of the four neighbours of 4 pixels in
// a 4-channel image: offset = 4 * x + y * v_src_element_stride. Results are
// written to offsets_a (x0, y0), offsets_b (x1, y0), offsets_c (x0, y1) and
// offsets_d (x1, y1).
565 40976 inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1,
566 uint16x4_t y1, uint32x4_t &offsets_a,
567 uint32x4_t &offsets_b, uint32x4_t &offsets_c,
568 uint32x4_t &offsets_d,
569 uint16x4_t v_src_element_stride) {
570 // Multiply by 4 because of channels
571 40976 uint32x4_t x0_scaled = vshll_n_u16(x0, 2);
572 40976 uint32x4_t x1_scaled = vshll_n_u16(x1, 2);
573
574 // Calculate offsets from coordinates (y * element_stride + x)
575 // a: top left, b: top right, c: bottom left, d: bottom right
576 40976 offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride);
577 40976 offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride);
578 40976 offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride);
579 40976 offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride);
580 40976 }
581
// Builds a u16 vector whose first four lanes hold frac_low and whose last
// four lanes hold frac_high.
// NOTE(review): this listing records no execution counts for this function
// and no call to it appears in the visible part of the file - check whether
// it is still used.
582 inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low,
583 uint8_t frac_high) {
584 uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low,
585 frac_high, frac_high, frac_high, frac_high};
586 return vmovl_u8(frac_low_high);
587 }
588
// Reads 4 bytes starting at src into a uint32_t via memcpy (so src needs no
// particular alignment) and returns the value zero-extended to 64 bits.
// NOTE(review): memcpy is used, but the only includes visible in this
// listing are <cassert> and two project headers; <cstring> presumably
// arrives transitively - confirm.
589 337408 inline uint64_t load_32bit(const uint8_t *src) {
590 337408 uint32_t value = 0;
591 337408 memcpy(&value, src, sizeof(uint32_t));
592 674816 return static_cast<uint64_t>(value);
593 337408 }
594
// Gathers four 4-byte pixels from src_rows at the given offsets and returns
// them as one 16-byte vector: offsets lane 0 and 1 form the low 8 bytes,
// lane 2 and 3 the high 8 bytes.
595 84352 inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows,
596 uint32x4_t offsets) {
597 168704 uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) |
598 84352 (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32);
599 168704 uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) |
600 84352 (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32);
601 168704 return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23));
602 84352 }
603
// Writes the two 16-byte result vectors to dst through
// neon::VecTraits<uint8_t>::store (defined outside this listing).
604 10544 inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) {
605 using ScalarType = uint8_t;
606 10544 neon::VecTraits<ScalarType>::store(res, &dst[0]);
607 10544 }
608
// Loads two 4-element u16 pixels from src_rows, one at each of the two
// offsets, and returns them combined (lane 0's pixel in the low half).
609 159104 inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows,
610 uint32x2_t offsets) {
611 318208 return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]),
612 159104 vld1_u16(&src_rows[vget_lane_u32(offsets, 1)]));
613 }
614
// Writes the four 8-lane u16 result vectors to dst through
// neon::VecTraits<uint16_t>::store (defined outside this listing).
615 9944 inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) {
616 using ScalarType = uint16_t;
617 9944 neon::VecTraits<ScalarType>::store(res, &dst[0]);
618 9944 }
619
620 // Replicate border specific functions
//
// Loads 8 map entries via get_coordinates() and prepares them for replicated
// borders: fractions are zeroed where the raw coordinate is negative, x0/y0
// are clamped to [0, v_xmax] / [0, v_ymax], and x1/y1 are set one past x0/y0
// unless already at the maximum.
// NOTE(review): the x1/y1 comparison here uses the already-clamped x0/y0,
// whereas the single-channel RemapS16Point5Replicate classes compare the
// unclamped map value. x0/y0 are uint16x8_t references but are passed to
// s16 intrinsics, relying on implicit vector conversions.
621 10244 inline void get_coordinates_replicate(Columns<const int16_t> mapxy,
622 Columns<const uint16_t> mapfrac,
623 uint16x8_t &x0, uint16x8_t &y0,
624 uint16x8_t &x1, uint16x8_t &y1,
625 uint16x8_t &xfrac, uint16x8_t &yfrac,
626 int16x8_t v_xmax, int16x8_t v_ymax) {
627 10244 get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
628
629 // Zero the xfrac (or yfrac) if x (or y) are below zero
630 10244 xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac);
631 10244 yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac);
632
633 // Clamp coordinates to within the dimensions of the source image
634 10244 x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax)));
635 10244 y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax)));
636
637 // x1 = x0 + 1, except if it's already xmax
638 10244 x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax));
639 10244 y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax));
640 10244 }
641
// Loads the four neighbour groups for 4 output pixels of a 4-channel u8
// image: each of a, b, c, d receives 16 bytes gathered by load_4px_4ch()
// from the corresponding offsets vector.
642 10544 inline void load_pixels_u8_4ch_replicate(
643 Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
644 uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b,
645 uint8x16_t &c, uint8x16_t &d) {
646 10544 a = load_4px_4ch(src_rows, offsets_a);
647 10544 b = load_4px_4ch(src_rows, offsets_b);
648 10544 c = load_4px_4ch(src_rows, offsets_c);
649 10544 d = load_4px_4ch(src_rows, offsets_d);
650 10544 }
651
// Loads the four neighbour groups for 4 output pixels of a 4-channel u16
// image. Each neighbour is split in two vectors: *_lo holds the pixels for
// offset lanes 0-1 and *_hi those for offset lanes 2-3, each gathered by
// load_2px_4ch().
652 9944 inline void load_pixels_u16_4ch_replicate(
653 Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
654 uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo,
655 uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo,
656 uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) {
657 9944 a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
658 9944 b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
659 9944 c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
660 9944 d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
661
662 9944 a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
663 9944 b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
664 9944 c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
665 9944 d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
666 9944 }
667
// Primary template, declared only. The visible listing provides explicit
// specialisations for uint8_t and uint16_t.
668 template <typename ScalarType>
669 class RemapS16Point5Replicate4ch;
670
// 4-channel u8 remap with replicated borders. Each vector_path call handles
// 8 map entries as two groups of 4 pixels (16 bytes each) and stores both
// groups together.
671 template <>
672 class RemapS16Point5Replicate4ch<uint8_t> {
673 public:
674 using ScalarType = uint8_t;
675 using MapVecTraits = neon::VecTraits<int16_t>;
676
// Stores the source rows and broadcasts the row stride, src_width - 1 and
// src_height - 1 into vectors.
677 132 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
678 size_t src_height)
679 132 : src_rows_{src_rows},
680 132 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
681 132 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
682 132 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
683
// Produces one destination row.
684 156 void process_row(size_t width, Columns<const int16_t> mapxy,
685 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
686 5428 auto vector_path = [&](size_t step) {
687 5272 uint16x8_t x0, y0, x1, y1;
688 5272 uint16x8_t xfrac, yfrac;
689 10544 get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
690 5272 v_xmax_, v_ymax_);
691
692 5272 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
693 5272 uint8x16_t a, b, c, d;
694 5272 uint8x16x2_t res;
695
// First group: pixels from the low 4 lanes of the coordinates.
696 10544 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
697 5272 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
698 5272 offsets_d, v_src_stride_);
699 10544 load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
700 5272 offsets_d, a, b, c, d);
701
// Zipping a vector with itself repeats every lane twice; doing it a second
// time gives four copies, so all 4 channels of a pixel share its weight.
702 // Doubled fractions 001122..., low part
703 5272 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
704 5272 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
705 10544 uint16x8_t nxfrac2 =
706 5272 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
707 10544 uint16x8_t nyfrac2 =
708 5272 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
709 // Quadrupled fractions (00001111) are passed to interpolate
// res0 covers the first two pixels of the group (low 8 bytes widened to
// u16), res1 the last two (high 8 bytes).
710 10544 uint16x8_t res0 = interpolate(
711 5272 vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
712 5272 vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
713 5272 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
714 10544 uint16x8_t res1 = interpolate(
715 5272 vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
716 5272 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
717 5272 vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
// vuzp1q_u8 keeps the even-numbered bytes, i.e. one byte out of each 16-bit
// result lane, packing 4 pixels x 4 channels into 16 bytes.
718 5272 res.val[0] =
719 5272 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
720
// Second group: pixels from the high 4 lanes of the coordinates.
721 10544 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
722 5272 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
723 5272 offsets_d, v_src_stride_);
724 10544 load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
725 5272 offsets_d, a, b, c, d);
726 // Doubled fractions 001122..., high part
727 5272 xfrac2 = vzip2q(xfrac, xfrac);
728 5272 yfrac2 = vzip2q(yfrac, yfrac);
729 5272 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
730 5272 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
731 // Quadrupled fractions (00001111) are passed to interpolate
732 10544 res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
733 5272 vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
734 5272 vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
735 5272 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
736 10544 res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
737 5272 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
738 5272 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
739 5272 vzip2q(nyfrac2, nyfrac2));
740 5272 res.val[1] =
741 5272 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
742
743 5272 store_pixels_u8_4ch(res, dst);
744 5272 mapxy += ptrdiff_t(step);
745 5272 mapfrac += ptrdiff_t(step);
746 5272 dst += ptrdiff_t(step);
747 5272 };
748
749 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
750 156 loop.unroll_once(vector_path);
// Rewind by (step - remaining_length) before the remainder pass.
// NOTE(review): how Columns scales these element steps for 4-channel data
// is defined outside this listing - confirm there.
751 312 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
752 156 static_cast<ptrdiff_t>(loop.remaining_length());
753 156 mapxy -= back_step;
754 156 mapfrac -= back_step;
755 156 dst -= back_step;
756 230 loop.remaining([&](size_t, size_t step) { vector_path(step); });
757 156 }
758
759 private:
760 Rows<const ScalarType> src_rows_;
761 uint16x4_t v_src_stride_;
762 int16x8_t v_xmax_;
763 int16x8_t v_ymax_;
764 }; // end of class RemapS16Point5Replicate4ch<uint8_t>
765
// Specialisation of RemapS16Point5Replicate4ch for uint16_t elements.
// process_row() handles MapVecTraits::num_lanes() map entries per
// vector_path() call: it fetches coordinates and fractions with
// get_coordinates_replicate(), loads the pixel sets a, b, c, d with
// load_pixels_u16_4ch_replicate(), combines them with interpolate() and
// writes the result with store_pixels_u16_4ch().
766 template <>
767 class RemapS16Point5Replicate4ch<uint16_t> {
768 public:
769 using ScalarType = uint16_t;
770 using MapVecTraits = neon::VecTraits<int16_t>;
771
// Caches the source rows, src_rows_.stride() / sizeof(ScalarType) broadcast
// as uint16_t, and src_width - 1 / src_height - 1 broadcast as int16_t.
// NOTE(review): the narrowing casts assume the values fit in 16 bits;
// presumably guaranteed by remap_s16point5_is_implemented() - confirm.
772 132 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
773 size_t src_height)
774 132 : src_rows_{src_rows},
775 264 v_src_element_stride_{vdup_n_u16(
776 132 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
777 132 v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
778 132 v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
779
// Processes `width` destination pixels of one row. mapxy, mapfrac and dst
// are cursors that vector_path advances by `step` after every call.
780 156 void process_row(size_t width, Columns<const int16_t> mapxy,
781 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
782 5128 auto vector_path = [&](size_t step) {
783 4972 uint16x8_t x0, y0, x1, y1;
784 4972 uint16x8_t xfrac, yfrac;
785 9944 get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
786 4972 v_xmax_, v_ymax_);
787
788 4972 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
789 4972 uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
790 4972 uint16x8x4_t res;
// Low half of the coordinate lanes: offsets, then pixel loads for a-d.
791 9944 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
792 4972 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
793 4972 offsets_d, v_src_element_stride_);
794 9944 load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
795 4972 offsets_d, a_low, a_high, b_low, b_high,
796 c_low, c_high, d_low, d_high);
797
798 // Doubled fractions 001122..., low part
799 4972 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
800 4972 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
// Complementary weights: REMAP16POINT5_FRAC_MAX minus each fraction.
801 9944 uint16x8_t nxfrac2 =
802 4972 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
803 9944 uint16x8_t nyfrac2 =
804 4972 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
805 // Quadrupled fractions (00001111) are passed to interpolate
806 4972 res.val[0] =
807 9944 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
808 4972 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
809 4972 vzip1q(nyfrac2, nyfrac2));
810 4972 res.val[1] =
811 9944 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
812 4972 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
813 4972 vzip2q(nyfrac2, nyfrac2));
814
// High half of the coordinate lanes: same steps, results go to res.val[2]
// and res.val[3]. The pixel and offset variables are reused.
815 9944 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
816 4972 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
817 4972 offsets_d, v_src_element_stride_);
818 9944 load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
819 4972 offsets_d, a_low, a_high, b_low, b_high,
820 c_low, c_high, d_low, d_high);
821 // Doubled fractions 001122..., high part
822 4972 xfrac2 = vzip2q(xfrac, xfrac);
823 4972 yfrac2 = vzip2q(yfrac, yfrac);
824 4972 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
825 4972 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
826 // Quadrupled fractions (00001111) are passed to interpolate
827 4972 res.val[2] =
828 9944 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
829 4972 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
830 4972 vzip1q(nyfrac2, nyfrac2));
831 4972 res.val[3] =
832 9944 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
833 4972 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
834 4972 vzip2q(nyfrac2, nyfrac2));
835
// Write the four result vectors, then advance all three cursors by step.
836 4972 store_pixels_u16_4ch(res, dst);
837 4972 mapxy += ptrdiff_t(step);
838 4972 mapfrac += ptrdiff_t(step);
839 4972 dst += ptrdiff_t(step);
840 4972 };
841
// Tail handling: after the unrolled pass the cursors are moved back by
// loop.step() - loop.remaining_length() and vector_path runs again for the
// remainder. NOTE(review): presumably this makes the final full vector end
// exactly at the row end, re-processing some elements - confirm against
// LoopUnroll.
842 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
843 156 loop.unroll_once(vector_path);
844 312 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
845 156 static_cast<ptrdiff_t>(loop.remaining_length());
846 156 mapxy -= back_step;
847 156 mapfrac -= back_step;
848 156 dst -= back_step;
849 230 loop.remaining([&](size_t, size_t step) { vector_path(step); });
850 156 }
851
852 private:
853 Rows<const ScalarType> src_rows_;
// Broadcast of src_rows_.stride() / sizeof(ScalarType).
854 uint16x4_t v_src_element_stride_;
// Broadcasts of src_width - 1 and src_height - 1.
855 int16x8_t v_xmax_;
856 int16x8_t v_ymax_;
857 }; // end of class RemapS16Point5Replicate4ch<uint16_t>
858
859 // Constant border specific functions
// Produces everything the constant-border paths need for one batch of lanes:
//  - x0, y0, xfrac, yfrac: filled in by get_coordinates(),
//  - x1, y1: x0 + 1 and y0 + 1 (vaddq_u16 wraps, so 0xFFFF + 1 gives 0),
//  - in_range_a..d: per-lane masks that are set when the coordinate pair is
//    inside the source, with a = (x0, y0), b = (x1, y0), c = (x0, y1),
//    d = (x1, y1).
// The range tests are unsigned "less than" comparisons against v_width and
// v_height. NOTE(review): presumably negative map coordinates reach this
// point as large unsigned values and therefore fail the test - confirm in
// get_coordinates().
860 10244 inline void get_coordinates_constant(
861 Columns<const int16_t> mapxy, Columns<const uint16_t> mapfrac,
862 uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0,
863 uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac,
864 uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c,
865 uint16x8_t &in_range_d) {
866 10244 get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
867
868 10244 uint16x8_t one = vdupq_n_u16(1);
869 10244 x1 = vaddq_u16(x0, one);
870 10244 y1 = vaddq_u16(y0, one);
871
// Each comparison yields all-ones in a lane that is in range, zero otherwise.
872 10244 uint16x8_t x0_in_range = vcltq_u16(x0, v_width);
873 10244 uint16x8_t y0_in_range = vcltq_u16(y0, v_height);
874 10244 uint16x8_t x1_in_range = vcltq_u16(x1, v_width);
875 10244 uint16x8_t y1_in_range = vcltq_u16(y1, v_height);
876
// A corner is usable only if both of its coordinates are in range.
877 10244 in_range_a = vandq(x0_in_range, y0_in_range);
878 10244 in_range_b = vandq(x1_in_range, y0_in_range);
879 10244 in_range_c = vandq(x0_in_range, y1_in_range);
880 10244 in_range_d = vandq(x1_in_range, y1_in_range);
881 10244 }
882
// Returns `offsets` with every bit cleared where `in_range` is clear (bitwise
// select between `offsets` and zero). With all-ones / all-zeros lane masks
// this keeps in-range offsets and turns the others into offset 0.
883 81952 inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range,
884 uint32x4_t offsets) {
885 81952 return vbslq_u32(in_range, offsets, vdupq_n_u32(0));
886 }
887
// Bitwise select between loaded pixels and the border colour: bits of
// `pixels` are kept where `in_range` is set, bits of `v_border` are used
// elsewhere. `in_range` is consumed as 32-bit lanes, i.e. one lane per
// 4 x uint8_t pixel.
// v_border is reinterpreted explicitly so that every vbslq_u32 operand is a
// uint32x4_t; the call no longer depends on the compiler permitting implicit
// conversions between different NEON vector types. The reinterpret does not
// change any bits.
888 42176 inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range,
889 uint8x16_t pixels,
890 uint8x16_t v_border) {
891 42176 return vreinterpretq_u8_u32(
892 42176 vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), vreinterpretq_u32_u8(v_border)));
893 }
894
// Bitwise select between loaded pixels and the border colour: bits of
// `pixels` are kept where `in_range` is set, bits of `v_border` are used
// elsewhere. `in_range` is consumed as 64-bit lanes, i.e. one lane per
// 4 x uint16_t pixel.
// v_border is reinterpreted explicitly so that every vbslq_u64 operand is a
// uint64x2_t; the call no longer depends on the compiler permitting implicit
// conversions between different NEON vector types. The reinterpret does not
// change any bits.
895 79552 inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range,
896 uint16x8_t pixels,
897 uint16x8_t v_border) {
898 79552 return vreinterpretq_u16_u64(
899 79552 vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), vreinterpretq_u64_u16(v_border)));
900 }
901
// Loads the pixel sets a, b, c, d for the constant-border uint8_t path.
// Steps, applied to each of the four sets:
//  1. offsets whose in_range mask is clear are replaced by 0,
//  2. pixels are fetched with load_4px_4ch(),
//  3. pixels whose in_range mask is clear are replaced by v_border.
// offsets_* are taken by value, so the caller's copies are not modified.
// The in_range_* arguments are used as 32-bit lane masks; the callers in this
// file pass the results of low16_to_s32() / hi16_to_s32().
902 10544 inline void load_pixels_u8_4ch_constant(
903 Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
904 uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
905 uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
906 uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c,
907 uint8x16_t &d) {
908 10544 offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
909 10544 offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
910 10544 offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
911 10544 offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
912
913 10544 a = load_4px_4ch(src_rows, offsets_a);
914 10544 b = load_4px_4ch(src_rows, offsets_b);
915 10544 c = load_4px_4ch(src_rows, offsets_c);
916 10544 d = load_4px_4ch(src_rows, offsets_d);
917
// Whatever was read at offset 0 for out-of-range pixels is discarded here.
918 10544 a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border);
919 10544 b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border);
920 10544 c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border);
921 10544 d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border);
922 10544 }
923
// Loads the pixel sets a, b, c, d for the constant-border uint16_t path.
// Each set is returned as two vectors: *_lo is loaded from the low two
// offsets and *_hi from the high two offsets, both via load_2px_4ch().
// As in the uint8_t variant, offsets with a clear in_range mask are replaced
// by 0 before loading and the loaded pixels are replaced by v_border
// afterwards. The 32-bit in_range lanes are sign-extended to 64 bits first,
// because replace_pixel_with_border_u16_4ch() selects on 64-bit lanes.
// offsets_* are taken by value, so the caller's copies are not modified.
924 9944 inline void load_pixels_u16_4ch_constant(
925 Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
926 uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a,
927 uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d,
928 uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo,
929 uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo,
930 uint16x8_t &d_hi) {
931 9944 offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a);
932 9944 offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b);
933 9944 offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c);
934 9944 offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d);
935
936 9944 a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
937 9944 b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
938 9944 c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
939 9944 d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
940
941 // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
// (sign extension of the two low 32-bit lanes)
942 49720 auto low32_to_u64 = [](uint32x4_t bitset) {
943 39776 return vreinterpretq_u64_s64(
944 39776 vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset))));
945 };
946
947 19888 a_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo,
948 9944 v_border);
949 19888 b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo,
950 9944 v_border);
951 19888 c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo,
952 9944 v_border);
953 19888 d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo,
954 9944 v_border);
955
956 9944 a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
957 9944 b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
958 9944 c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
959 9944 d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
960
961 // Convert bitsets such as in_range to 64bits, making all 1s or all 0s
// (sign extension of the two high 32-bit lanes)
962 49720 auto hi32_to_u64 = [](uint32x4_t bitset) {
963 39776 return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset)));
964 };
965
966 19888 a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi,
967 9944 v_border);
968 19888 b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi,
969 9944 v_border);
970 19888 c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi,
971 9944 v_border);
972 19888 d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi,
973 9944 v_border);
974 9944 }
975
976 // Convert bitsets such as in_range to 32bits, making all 1s or all 0s
// Sign-extends the four low 16-bit lanes of `bitset` to 32 bits, so a lane of
// 0xFFFF becomes 0xFFFFFFFF and a lane of 0 stays 0.
// NOTE(review): despite the "_s32" suffix the result is returned as
// uint32x4_t; the signed view is only used for the widening step.
977 40976 static uint32x4_t low16_to_s32(uint16x8_t bitset) {
978 40976 return vreinterpretq_u32_s32(
979 40976 vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset))));
980 }
981
// Sign-extends the four high 16-bit lanes of `bitset` to 32 bits and returns
// them as uint32x4_t, so all-ones / all-zeros lane masks keep that property
// at the wider lane size.
982 40976 static uint32x4_t hi16_to_s32(uint16x8_t bitset) {
983 40976 return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset)));
984 }
985
// Primary template, declared only. The implementations are the explicit
// specialisations for uint8_t and uint16_t that follow.
986 template <typename ScalarType>
987 class RemapS16Point5Constant4ch;
988
// Specialisation of RemapS16Point5Constant4ch for uint8_t elements.
// process_row() handles MapVecTraits::num_lanes() map entries per
// vector_path() call. Pixels whose coordinates are flagged out of range by
// get_coordinates_constant() are replaced by the border value inside
// load_pixels_u8_4ch_constant() before interpolate() is applied.
989 template <>
990 class RemapS16Point5Constant4ch<uint8_t> {
991 public:
992 using ScalarType = uint8_t;
993 using MapVecTraits = neon::VecTraits<int16_t>;
994
// Caches the source rows, the stride, width and height broadcast as uint16_t,
// and the border colour. sizeof(uint32_t) bytes are copied from border_value
// (four uint8_t values) and broadcast to every 32-bit lane of v_border_, so
// border_value must point to at least four elements.
// NOTE(review): the narrowing casts assume the values fit in 16 bits;
// presumably guaranteed by remap_s16point5_is_implemented() - confirm.
995 132 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
996 size_t src_height, const ScalarType *border_value)
997 132 : src_rows_{src_rows},
998 132 v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
999 132 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
1000 132 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
1001 132 v_border_{} {
1002 132 uint32_t border_value_32{};
1003 132 memcpy(&border_value_32, border_value, sizeof(uint32_t));
1004 132 v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32));
1005 132 }
1006
// Processes `width` destination pixels of one row. mapxy, mapfrac and dst
// are cursors that vector_path advances by `step` after every call.
1007 156 void process_row(size_t width, Columns<const int16_t> mapxy,
1008 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1009 5428 auto vector_path = [&](size_t step) {
1010 5272 uint16x8_t x0, y0, x1, y1;
1011 5272 uint16x8_t xfrac, yfrac;
1012 5272 uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
1013 5272 get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
1014 y1, xfrac, yfrac, in_range_a, in_range_b,
1015 in_range_c, in_range_d);
1016
1017 5272 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
1018 5272 uint8x16_t a, b, c, d;
1019 5272 uint8x16x2_t res;
1020
// Low half of the coordinate lanes.
1021 10544 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
1022 5272 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
1023 5272 offsets_d, v_src_stride_);
1024
// The 16-bit range masks are widened to 32 bits, one lane per loaded pixel.
1025 5272 load_pixels_u8_4ch_constant(
1026 5272 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1027 5272 low16_to_s32(in_range_a), low16_to_s32(in_range_b),
1028 5272 low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b,
1029 c, d);
1030
1031 // Doubled fractions 001122..., low part
1032 5272 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
1033 5272 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
// Complementary weights: REMAP16POINT5_FRAC_MAX minus each fraction.
1034 10544 uint16x8_t nxfrac2 =
1035 5272 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1036 10544 uint16x8_t nyfrac2 =
1037 5272 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1038 // Quadrupled fractions (00001111) are passed to interpolate
// Pixel bytes are widened to 16 bits before being handed to interpolate().
1039 10544 uint16x8_t res0 = interpolate(
1040 5272 vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
1041 5272 vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
1042 5272 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
1043 10544 uint16x8_t res1 = interpolate(
1044 5272 vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
1045 5272 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
1046 5272 vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
// vuzp1q_u8 keeps the even-indexed bytes of res0 followed by those of res1,
// which narrows the 16-bit results back to 8 bits per channel.
1047 5272 res.val[0] =
1048 5272 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
1049
// High half of the coordinate lanes: same steps, result goes to res.val[1].
1050 10544 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
1051 5272 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
1052 5272 offsets_d, v_src_stride_);
1053
1054 5272 load_pixels_u8_4ch_constant(
1055 5272 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1056 5272 hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
1057 5272 hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c,
1058 d);
1059 // Doubled fractions 001122..., high part
1060 5272 xfrac2 = vzip2q(xfrac, xfrac);
1061 5272 yfrac2 = vzip2q(yfrac, yfrac);
1062 5272 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1063 5272 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1064 // Quadrupled fractions (00001111) are passed to interpolate
1065 10544 res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
1066 5272 vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
1067 5272 vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
1068 5272 vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
1069 10544 res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
1070 5272 vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
1071 5272 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1072 5272 vzip2q(nyfrac2, nyfrac2));
1073 5272 res.val[1] =
1074 5272 vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
1075
// Write both result vectors, then advance all three cursors by step.
1076 5272 store_pixels_u8_4ch(res, dst);
1077 5272 mapxy += ptrdiff_t(step);
1078 5272 mapfrac += ptrdiff_t(step);
1079 5272 dst += ptrdiff_t(step);
1080 5272 };
1081
// Tail handling: after the unrolled pass the cursors are moved back by
// loop.step() - loop.remaining_length() and vector_path runs again for the
// remainder. NOTE(review): presumably this makes the final full vector end
// exactly at the row end, re-processing some elements - confirm against
// LoopUnroll.
1082 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
1083 156 loop.unroll_once(vector_path);
1084 312 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
1085 156 static_cast<ptrdiff_t>(loop.remaining_length());
1086 156 mapxy -= back_step;
1087 156 mapfrac -= back_step;
1088 156 dst -= back_step;
1089 230 loop.remaining([&](size_t, size_t step) { vector_path(step); });
1090 156 }
1091
1092 private:
1093 Rows<const ScalarType> src_rows_;
// Broadcast of src_rows_.stride().
1094 uint16x4_t v_src_stride_;
// Broadcasts of src_width and src_height, used as exclusive upper bounds.
1095 uint16x8_t v_width_;
1096 uint16x8_t v_height_;
// Border colour repeated in every 32-bit lane.
1097 uint8x16_t v_border_;
1098 }; // end of class RemapS16Point5Constant4ch<uint8_t>
1099
// Specialisation of RemapS16Point5Constant4ch for uint16_t elements.
// process_row() handles MapVecTraits::num_lanes() map entries per
// vector_path() call. Pixels whose coordinates are flagged out of range by
// get_coordinates_constant() are replaced by the border value inside
// load_pixels_u16_4ch_constant() before interpolate() is applied.
1100 template <>
1101 class RemapS16Point5Constant4ch<uint16_t> {
1102 public:
1103 using ScalarType = uint16_t;
1104 using MapVecTraits = neon::VecTraits<int16_t>;
1105
// Caches the source rows, src_rows_.stride() / sizeof(ScalarType), width and
// height broadcast as uint16_t, and the border colour. sizeof(uint64_t)
// bytes are copied from border_value (four uint16_t values) and broadcast to
// both 64-bit lanes of v_border_, so border_value must point to at least four
// elements.
// NOTE(review): the narrowing casts assume the values fit in 16 bits;
// presumably guaranteed by remap_s16point5_is_implemented() - confirm.
1106 132 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
1107 size_t src_height, const ScalarType *border_value)
1108 132 : src_rows_{src_rows},
1109 264 v_src_element_stride_{vdup_n_u16(
1110 132 static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
1111 132 v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
1112 132 v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
1113 132 v_border_{} {
1114 132 uint64_t border_value_64{};
1115 132 memcpy(&border_value_64, border_value, sizeof(uint64_t));
1116 132 v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64));
1117 132 }
1118
// Processes `width` destination pixels of one row. mapxy, mapfrac and dst
// are cursors that vector_path advances by `step` after every call.
1119 156 void process_row(size_t width, Columns<const int16_t> mapxy,
1120 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1121 5128 auto vector_path = [&](size_t step) {
1122 4972 uint16x8_t x0, y0, x1, y1;
1123 4972 uint16x8_t xfrac, yfrac;
1124 4972 uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d;
1125 4972 get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1,
1126 y1, xfrac, yfrac, in_range_a, in_range_b,
1127 in_range_c, in_range_d);
1128
1129 4972 uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
1130 4972 uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
1131 4972 uint16x8x4_t res;
1132
// Low half of the coordinate lanes.
1133 9944 get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
1134 4972 vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
1135 4972 offsets_d, v_src_element_stride_);
1136
// The 16-bit range masks are widened to 32 bits here; the loader widens them
// again to 64 bits.
1137 4972 load_pixels_u16_4ch_constant(
1138 4972 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1139 4972 low16_to_s32(in_range_a), low16_to_s32(in_range_b),
1140 4972 low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low,
1141 a_high, b_low, b_high, c_low, c_high, d_low, d_high);
1142
1143 // Doubled fractions 001122..., low part
1144 4972 uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
1145 4972 uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
// Complementary weights: REMAP16POINT5_FRAC_MAX minus each fraction.
1146 9944 uint16x8_t nxfrac2 =
1147 4972 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1148 9944 uint16x8_t nyfrac2 =
1149 4972 vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1150 // Quadrupled fractions (00001111) are passed to interpolate
1151 4972 res.val[0] =
1152 9944 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
1153 4972 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
1154 4972 vzip1q(nyfrac2, nyfrac2));
1155 4972 res.val[1] =
1156 9944 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
1157 4972 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1158 4972 vzip2q(nyfrac2, nyfrac2));
1159
// High half of the coordinate lanes: same steps, results go to res.val[2]
// and res.val[3]. The pixel and offset variables are reused.
1160 9944 get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
1161 4972 vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
1162 4972 offsets_d, v_src_element_stride_);
1163
1164 4972 load_pixels_u16_4ch_constant(
1165 4972 src_rows_, offsets_a, offsets_b, offsets_c, offsets_d,
1166 4972 hi16_to_s32(in_range_a), hi16_to_s32(in_range_b),
1167 4972 hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low,
1168 a_high, b_low, b_high, c_low, c_high, d_low, d_high);
1169
1170 // Doubled fractions 001122..., high part
1171 4972 xfrac2 = vzip2q(xfrac, xfrac);
1172 4972 yfrac2 = vzip2q(yfrac, yfrac);
1173 4972 nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
1174 4972 nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
1175 // Quadrupled fractions (00001111) are passed to interpolate
1176 4972 res.val[2] =
1177 9944 interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
1178 4972 vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
1179 4972 vzip1q(nyfrac2, nyfrac2));
1180 4972 res.val[3] =
1181 9944 interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
1182 4972 vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
1183 4972 vzip2q(nyfrac2, nyfrac2));
1184
// Write the four result vectors, then advance all three cursors by step.
1185 4972 store_pixels_u16_4ch(res, dst);
1186 4972 mapxy += ptrdiff_t(step);
1187 4972 mapfrac += ptrdiff_t(step);
1188 4972 dst += ptrdiff_t(step);
1189 4972 };
1190
// Tail handling: after the unrolled pass the cursors are moved back by
// loop.step() - loop.remaining_length() and vector_path runs again for the
// remainder. NOTE(review): presumably this makes the final full vector end
// exactly at the row end, re-processing some elements - confirm against
// LoopUnroll.
1191 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
1192 156 loop.unroll_once(vector_path);
1193 312 ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
1194 156 static_cast<ptrdiff_t>(loop.remaining_length());
1195 156 mapxy -= back_step;
1196 156 mapfrac -= back_step;
1197 156 dst -= back_step;
1198 230 loop.remaining([&](size_t, size_t step) { vector_path(step); });
1199 156 }
1200
1201 private:
1202 Rows<const ScalarType> src_rows_;
// Broadcast of src_rows_.stride() / sizeof(ScalarType).
1203 uint16x4_t v_src_element_stride_;
// Broadcasts of src_width and src_height, used as exclusive upper bounds.
1204 uint16x8_t v_width_;
1205 uint16x8_t v_height_;
// Border colour repeated in both 64-bit lanes.
1206 uint16x8_t v_border_;
1207 }; // end of class RemapS16Point5Constant4ch<uint16_t>
1208
// Entry point of the NEON remap for element type T.
// Order of work:
//  1. argument checks through CHECK_POINTER_AND_STRIDE / CHECK_IMAGE_SIZE
//     (presumably these return an error code on failure - confirm in the
//     macro definitions),
//  2. KLEIDICV_ERROR_NULL_POINTER if the border type is
//     KLEIDICV_BORDER_TYPE_CONSTANT and border_value is null,
//  3. KLEIDICV_ERROR_NOT_IMPLEMENTED if remap_s16point5_is_implemented<T>()
//     rejects the configuration,
//  4. otherwise one of four row operations is selected by border type
//     (constant / replicate) and channel count (1 / 4) and run over the
//     destination rectangle with zip_rows(); the function then returns
//     KLEIDICV_OK.
// The mapxy, mapfrac and dst row checks all use dst_height.
1209 // Most of the complexity comes from parameter checking.
1210 // NOLINTBEGIN(readability-function-cognitive-complexity)
1211 template <typename T>
1212 1140 kleidicv_error_t remap_s16point5(
1213 const T *src, size_t src_stride, size_t src_width, size_t src_height,
1214 T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1215 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
1216 const uint16_t *mapfrac, size_t mapfrac_stride,
1217 [[maybe_unused]] kleidicv_border_type_t border_type,
1218 [[maybe_unused]] const T *border_value) {
1219
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 568 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 568 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 568 times.
1140 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1220
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 566 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 566 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 566 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 566 times.
1136 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1221
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 564 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 564 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 564 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 564 times.
1132 CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
1222
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 562 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 562 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 562 times.
1128 CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height);
1223
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 556 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 556 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 560 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 556 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 556 times.
1124 CHECK_IMAGE_SIZE(src_width, src_height);
1224
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 554 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 552 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 552 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 554 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 552 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 552 times.
1112 CHECK_IMAGE_SIZE(dst_width, dst_height);
1225
// A constant border needs a colour to read; the constant-border operations
// dereference border_value in their constructors.
8/8
✓ Branch 0 taken 268 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 268 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 266 times.
✓ Branch 7 taken 2 times.
1104 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
1226 4 return KLEIDICV_ERROR_NULL_POINTER;
1227 }
1228
1229
8/8
✓ Branch 0 taken 530 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 530 times.
✓ Branch 3 taken 20 times.
✓ Branch 4 taken 530 times.
✓ Branch 5 taken 20 times.
✓ Branch 6 taken 530 times.
✓ Branch 7 taken 20 times.
2200 if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height,
1230 1100 dst_width, border_type, channels)) {
1231 40 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1232 }
1233
// Row views: mapxy is constructed with 2 channels and mapfrac with 1, while
// src and dst use `channels`.
1234 1060 Rows<const T> src_rows{src, src_stride, channels};
1235 1060 Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
1236 1060 Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1};
1237 1060 Rows<T> dst_rows{dst, dst_stride, channels};
1238 1060 Rectangle rect{dst_width, dst_height};
1239
// Dispatch. Border types other than CONSTANT and channel counts other than
// 1 are only guarded by assert(), so they rely on the is_implemented check
// above in builds where asserts are disabled.
4/4
✓ Branch 0 taken 266 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 264 times.
1060 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
1240
4/4
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 132 times.
528 if (channels == 1) {
1241 528 RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height,
1242 264 border_value};
1243 264 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1244 264 } else {
1245 assert(channels == 4);
1246 528 RemapS16Point5Constant4ch<T> operation{src_rows, src_width, src_height,
1247 264 border_value};
1248 264 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1249 264 }
1250 528 } else {
1251 assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
1252
4/4
✓ Branch 0 taken 134 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 134 times.
✓ Branch 3 taken 132 times.
532 if (channels == 1) {
1253 268 RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height};
1254 268 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1255 268 } else {
1256 assert(channels == 4);
1257 264 RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height};
1258 264 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1259 264 }
1260 }
1261 1060 return KLEIDICV_OK;
1262 1140 }
1263 // NOLINTEND(readability-function-cognitive-complexity)
1264
// Expands to an explicit instantiation of remap_s16point5<type>, decorated
// with KLEIDICV_TARGET_FN_ATTRS. It is used below for uint8_t and uint16_t.
1265 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \
1266 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \
1267 const type *src, size_t src_stride, size_t src_width, size_t src_height, \
1268 type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
1269 size_t channels, const int16_t *mapxy, size_t mapxy_stride, \
1270 const uint16_t *mapfrac, size_t mapfrac_stride, \
1271 kleidicv_border_type_t border_type, const type *border_value)
1272
1273 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
1274 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
1275
1276 } // namespace kleidicv::neon
1277