KleidiCV Coverage Report

Directory: ./
File: kleidicv/src/transform/remap_s16point5_sve2.cpp
Date: 2025-09-25 14:13:34

            Exec  Total  Coverage
Lines:       816    816    100.0%
Functions:    55     55    100.0%
Branches:     88     88    100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cmath>
7 #include <cstddef>
8 #include <cstdint>
9
10 #include "kleidicv/sve2.h"
11 #include "kleidicv/traits.h"
12 #include "kleidicv/transform/remap.h"
13 #include "transform_sve2.h"
14
15 namespace kleidicv::sve2 {
16
17 template <typename ScalarType>
18 inline svuint16_t interpolate_16point5(svbool_t pg, svuint16_t frac,
19 svuint16_t src_a, svuint16_t src_b,
20 svuint16_t src_c, svuint16_t src_d,
21 svuint32_t bias);
22
23 template <>
24 10546 inline svuint16_t interpolate_16point5<uint8_t>(
25 svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b,
26 svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
27 10546 svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
28 21092 svuint16_t yfrac =
29 21092 svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
30 10546 svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
31 21092 svuint16_t nxfrac =
32 10546 svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
33 21092 svuint16_t nyfrac =
34 10546 svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
35 10546 svuint16_t line0 = svmla_x(pg, svmul_x(pg, xfrac, src_b), nxfrac, src_a);
36 10546 svuint16_t line1 = svmla_x(pg, svmul_x(pg, xfrac, src_d), nxfrac, src_c);
37
38 10546 svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac);
39 10546 svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac);
40 10546 acc_b = svmlalb_u32(acc_b, line1, yfrac);
41 10546 acc_t = svmlalt_u32(acc_t, line1, yfrac);
42
43 21092 return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
44 2ULL * REMAP16POINT5_FRAC_BITS);
45 10546 }
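
For reference, the arithmetic this specialization performs per lane can be written in scalar form. The sketch below is illustrative only: the function name and the frac_bits parameter are not part of the library, and it assumes the usual Q-format relationship REMAP16POINT5_FRAC_MAX == 1u << REMAP16POINT5_FRAC_BITS with bias == REMAP16POINT5_FRAC_MAX_SQUARE / 2, which is how the callers below set it up.

#include <cstdint>

// Scalar model of interpolate_16point5<uint8_t> for a single lane. frac packs
// the x fraction in its low frac_bits bits and the y fraction in the bits
// above, exactly as the vector code decodes it.
inline uint16_t interpolate_16point5_scalar(uint16_t frac, uint16_t a,
                                            uint16_t b, uint16_t c, uint16_t d,
                                            unsigned frac_bits) {
  const uint32_t frac_max = 1u << frac_bits;
  const uint32_t xfrac = frac & (frac_max - 1);
  const uint32_t yfrac = (frac >> frac_bits) & (frac_max - 1);
  const uint32_t nxfrac = frac_max - xfrac;
  const uint32_t nyfrac = frac_max - yfrac;
  // Two horizontal lerps (row a-b and row c-d), then one vertical lerp.
  const uint32_t line0 = nxfrac * a + xfrac * b;
  const uint32_t line1 = nxfrac * c + xfrac * d;
  const uint32_t bias = (frac_max * frac_max) / 2;  // round to nearest
  const uint32_t acc = bias + nyfrac * line0 + yfrac * line1;
  return static_cast<uint16_t>(acc >> (2 * frac_bits));
}

In the vector code above, the horizontal lerps stay in 16-bit lanes, only the vertical accumulation is widened to 32 bits (svmlalb/svmlalt), and svshrnb/svshrnt perform the final shift and narrow back to 16-bit lanes in one step.
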
46
47 template <>
48 9946 inline svuint16_t interpolate_16point5<uint16_t>(
49 svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b,
50 svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
51 9946 svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
52 19892 svuint16_t yfrac =
53 19892 svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
54 9946 svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
55 19892 svuint16_t nxfrac =
56 9946 svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
57 19892 svuint16_t nyfrac =
58 9946 svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
59 19892 svuint32_t line0_b = svmla_x(pg, svmullb(xfrac, src_b), svmovlb_u32(nxfrac),
60 9946 svmovlb_u32(src_a));
61 19892 svuint32_t line0_t = svmla_x(pg, svmullt(xfrac, src_b), svmovlt_u32(nxfrac),
62 9946 svmovlt_u32(src_a));
63 19892 svuint32_t line1_b = svmla_x(pg, svmullb(xfrac, src_d), svmovlb_u32(nxfrac),
64 9946 svmovlb_u32(src_c));
65 19892 svuint32_t line1_t = svmla_x(pg, svmullt(xfrac, src_d), svmovlt_u32(nxfrac),
66 9946 svmovlt_u32(src_c));
67
68 19892 svuint32_t acc_b =
69 19892 svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_b, svmovlb_u32(nyfrac)),
70 9946 line1_b, svmovlb_u32(yfrac));
71 19892 svuint32_t acc_t =
72 19892 svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_t, svmovlt_u32(nyfrac)),
73 9946 line1_t, svmovlt_u32(yfrac));
74
75 19892 return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
76 2ULL * REMAP16POINT5_FRAC_BITS);
77 9946 }
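
The only difference from the uint8_t specialization is the width of the intermediates. With uint8_t sources a term such as nxfrac * src_a fits comfortably in a 16-bit lane (for example, if REMAP16POINT5_FRAC_BITS is 5, as the 16point5 naming suggests, the worst case is 32 * 255 = 8160), so the horizontal lerps can use plain 16-bit svmul/svmla. With uint16_t sources the same term can reach 32 * 65535 = 2097120, so svmullb/svmullt widen every product to 32 bits before accumulating; the bias, the final accumulation, and the narrowing shift are otherwise identical.
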
78
79 template <typename ScalarType>
80 class RemapS16Point5Replicate;
81
82 template <>
83 class RemapS16Point5Replicate<uint8_t> {
84 public:
85 using ScalarType = uint8_t;
86 using MapVecTraits = VecTraits<int16_t>;
87 using MapVectorType = typename MapVecTraits::VectorType;
88 using MapVector2Type = typename MapVecTraits::Vector2Type;
89 using FracVecTraits = VecTraits<uint16_t>;
90 using FracVectorType = typename FracVecTraits::VectorType;
91
92 134 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
93 size_t src_height, svuint16_t& v_src_stride,
94 MapVectorType& v_x_max, MapVectorType& v_y_max)
95 134 : src_rows_{src_rows},
96 134 v_src_stride_{v_src_stride},
97 134 v_xmax_{v_x_max},
98 134 v_ymax_{v_y_max} {
99 134 v_src_stride_ = svdup_u16(src_rows.stride());
100 134 v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1));
101 134 v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1));
102 134 }
103
104 158 void process_row(size_t width, Columns<const int16_t> mapxy,
105 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
106 158 svuint16_t src_a, src_b, src_c, src_d;
107
108 158 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
109 5432 auto vector_path = [&](svbool_t pg, ptrdiff_t step) {
110 5274 load_source(pg, step, mapxy, src_a, src_b, src_c, src_d);
111 10548 interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d,
112 5274 bias);
113 5274 };
114
115 158 LoopUnroll loop{width, MapVecTraits::num_lanes()};
116 5358 loop.unroll_once([&](size_t step) {
117 5200 svbool_t pg = MapVecTraits::svptrue();
118 5200 vector_path(pg, static_cast<ptrdiff_t>(step));
119 5200 });
120 232 loop.remaining([&](size_t length, size_t step) {
121 74 svbool_t pg = MapVecTraits::svwhilelt(step - length, step);
122 74 vector_path(pg, static_cast<ptrdiff_t>(length));
123 74 });
124 158 }
125
126 protected:
127 21096 svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t,
128 svuint32_t offsets_t) {
129 42192 svuint32_t src_b =
130 21096 svldnt1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
131 42192 svuint32_t src_t =
132 21096 svldnt1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
133 63288 return svtrn1_u16(svreinterpret_u16_u32(src_b),
134 21096 svreinterpret_u16_u32(src_t));
135 21096 }
136
137 5274 void load_source(svbool_t pg, ptrdiff_t step, Columns<const int16_t>& mapxy,
138 svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c,
139 svuint16_t& src_d) {
140 5274 MapVector2Type xy = svld2_s16(pg, &mapxy[0]);
141
142 // Clamp coordinates to within the dimensions of the source image
143 10548 svuint16_t x0 = svreinterpret_u16_s16(
144 5274 svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_)));
145 10548 svuint16_t y0 = svreinterpret_u16_s16(
146 5274 svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_)));
147
148 // x1 = x0 + 1, and clamp it too
149 10548 svuint16_t x1 = svreinterpret_u16_s16(
150 10548 svmax_x(pg, svdup_n_s16(0),
151 5274 svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_)));
152
153 10548 svuint16_t y1 = svreinterpret_u16_s16(
154 10548 svmax_x(pg, svdup_n_s16(0),
155 5274 svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_)));
156 5274 svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2);
157 5274 svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2);
158
159 // Calculate offsets from coordinates (y * stride + x)
160 5274 svuint32_t offsets_a_b = svmlalb_u32(svmovlb_u32(x0), y0, v_src_stride_);
161 5274 svuint32_t offsets_a_t = svmlalt_u32(svmovlt_u32(x0), y0, v_src_stride_);
162 5274 svuint32_t offsets_b_b = svmlalb_u32(svmovlb_u32(x1), y0, v_src_stride_);
163 5274 svuint32_t offsets_b_t = svmlalt_u32(svmovlt_u32(x1), y0, v_src_stride_);
164 5274 svuint32_t offsets_c_b = svmlalb_u32(svmovlb_u32(x0), y1, v_src_stride_);
165 5274 svuint32_t offsets_c_t = svmlalt_u32(svmovlt_u32(x0), y1, v_src_stride_);
166 5274 svuint32_t offsets_d_b = svmlalb_u32(svmovlb_u32(x1), y1, v_src_stride_);
167 5274 svuint32_t offsets_d_t = svmlalt_u32(svmovlt_u32(x1), y1, v_src_stride_);
168
169 // Load pixels from source
170 5274 src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t);
171 5274 src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t);
172 5274 src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t);
173 5274 src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t);
174 5274 mapxy += step;
175 5274 }
176
177 5274 void interpolate_and_store(svbool_t pg, ptrdiff_t step,
178 Columns<const uint16_t>& mapfrac,
179 Columns<ScalarType>& dst, svuint16_t src_a,
180 svuint16_t src_b, svuint16_t src_c,
181 svuint16_t src_d, svuint32_t bias) {
182 5274 FracVectorType frac = svld1_u16(pg, &mapfrac[0]);
183 10548 svuint16_t result = interpolate_16point5<uint8_t>(pg, frac, src_a, src_b,
184 5274 src_c, src_d, bias);
185 5274 svst1b_u16(pg, &dst[0], result);
186 5274 mapfrac += step;
187 5274 dst += step;
188 5274 }
189
190 Rows<const ScalarType> src_rows_;
191
192 private:
193 svuint16_t& v_src_stride_;
194 MapVectorType& v_xmax_;
195 MapVectorType& v_ymax_;
196 }; // end of class RemapS16Point5Replicate<uint8_t>
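
The addressing in load_source above amounts to the following scalar logic; this is an illustrative sketch (pixel_replicate and its parameters are not library API), not the implementation. Replicate-border behaviour comes entirely from clamping both the integer coordinate and its +1 neighbour into the valid range before forming the y * stride + x byte offset used by the gathers; the pg_b/pg_t predicates and the svmlalb/svmlalt pairs simply do this for the even and odd 16-bit lanes separately, because the gathers take 32-bit offsets.

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Scalar model of one bilinear tap with replicate-border addressing.
inline const uint8_t* pixel_replicate(const uint8_t* src, size_t src_stride,
                                      int x, int y, int x_max, int y_max) {
  // svmax/svmin above; the vector code also uses a saturating +1 (svqadd)
  // so that x + 1 and y + 1 cannot wrap around at INT16_MAX.
  const int cx = std::clamp(x, 0, x_max);
  const int cy = std::clamp(y, 0, y_max);
  return src + static_cast<size_t>(cy) * src_stride + static_cast<size_t>(cx);
}

// The four taps for a map entry (x, y) are then
//   a = *pixel_replicate(src, stride, x,     y,     x_max, y_max);
//   b = *pixel_replicate(src, stride, x + 1, y,     x_max, y_max);
//   c = *pixel_replicate(src, stride, x,     y + 1, x_max, y_max);
//   d = *pixel_replicate(src, stride, x + 1, y + 1, x_max, y_max);
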
197
198 template <>
199 class RemapS16Point5Replicate<uint16_t> {
200 public:
201 using ScalarType = uint16_t;
202 using MapVecTraits = VecTraits<int16_t>;
203 using MapVectorType = typename MapVecTraits::VectorType;
204 using MapVector2Type = typename MapVecTraits::Vector2Type;
205 using FracVecTraits = VecTraits<uint16_t>;
206 using FracVectorType = typename FracVecTraits::VectorType;
207
208 134 RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width,
209 size_t src_height, svuint16_t& v_src_stride,
210 MapVectorType& v_x_max, MapVectorType& v_y_max)
211 134 : src_rows_{src_rows},
212 134 v_src_element_stride_{v_src_stride},
213 134 v_xmax_{v_x_max},
214 134 v_ymax_{v_y_max} {
215 134 v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType));
216 134 v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1));
217 134 v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1));
218 134 }
219
220 158 void process_row(size_t width, Columns<const int16_t> mapxy,
221 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
222 158 svuint16_t src_a, src_b, src_c, src_d;
223
224 158 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
225 5132 auto vector_path = [&](svbool_t pg, ptrdiff_t step) {
226 4974 load_source(pg, step, mapxy, src_a, src_b, src_c, src_d);
227 9948 interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d,
228 4974 bias);
229 4974 };
230
231 158 LoopUnroll loop{width, MapVecTraits::num_lanes()};
232 5058 loop.unroll_once([&](size_t step) {
233 4900 svbool_t pg = MapVecTraits::svptrue();
234 4900 vector_path(pg, static_cast<ptrdiff_t>(step));
235 4900 });
236 232 loop.remaining([&](size_t length, size_t step) {
237 74 svbool_t pg = MapVecTraits::svwhilelt(step - length, step);
238 74 vector_path(pg, static_cast<ptrdiff_t>(length));
239 74 });
240 158 }
241
242 protected:
243 19896 svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t,
244 svuint32_t offsets_t) {
245 // Account for the size of the source type when calculating offset
246 19896 offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1);
247 19896 offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1);
248
249 39792 svuint32_t src_b =
250 19896 svldnt1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
251 39792 svuint32_t src_t =
252 19896 svldnt1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
253 59688 return svtrn1_u16(svreinterpret_u16_u32(src_b),
254 19896 svreinterpret_u16_u32(src_t));
255 19896 }
256
257 4974 void load_source(svbool_t pg, ptrdiff_t step, Columns<const int16_t>& mapxy,
258 svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c,
259 svuint16_t& src_d) {
260 4974 MapVector2Type xy = svld2_s16(pg, &mapxy[0]);
261
262 // Clamp coordinates to within the dimensions of the source image
263 9948 svuint16_t x0 = svreinterpret_u16_s16(
264 4974 svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_)));
265 9948 svuint16_t y0 = svreinterpret_u16_s16(
266 4974 svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_)));
267
268 // x1 = x0 + 1, and clamp it too
269 9948 svuint16_t x1 = svreinterpret_u16_s16(
270 9948 svmax_x(pg, svdup_n_s16(0),
271 4974 svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_)));
272
273 9948 svuint16_t y1 = svreinterpret_u16_s16(
274 9948 svmax_x(pg, svdup_n_s16(0),
275 4974 svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_)));
276 4974 svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2);
277 4974 svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2);
278
279 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
280 9948 svuint32_t offsets_a_b =
281 4974 svmlalb_u32(svmovlb_u32(x0), y0, v_src_element_stride_);
282 9948 svuint32_t offsets_a_t =
283 4974 svmlalt_u32(svmovlt_u32(x0), y0, v_src_element_stride_);
284 9948 svuint32_t offsets_b_b =
285 4974 svmlalb_u32(svmovlb_u32(x1), y0, v_src_element_stride_);
286 9948 svuint32_t offsets_b_t =
287 4974 svmlalt_u32(svmovlt_u32(x1), y0, v_src_element_stride_);
288 9948 svuint32_t offsets_c_b =
289 4974 svmlalb_u32(svmovlb_u32(x0), y1, v_src_element_stride_);
290 9948 svuint32_t offsets_c_t =
291 4974 svmlalt_u32(svmovlt_u32(x0), y1, v_src_element_stride_);
292 9948 svuint32_t offsets_d_b =
293 4974 svmlalb_u32(svmovlb_u32(x1), y1, v_src_element_stride_);
294 9948 svuint32_t offsets_d_t =
295 4974 svmlalt_u32(svmovlt_u32(x1), y1, v_src_element_stride_);
296
297 // Load pixels from source
298 4974 src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t);
299 4974 src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t);
300 4974 src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t);
301 4974 src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t);
302 4974 mapxy += step;
303 4974 }
304
305 4974 void interpolate_and_store(svbool_t pg, ptrdiff_t step,
306 Columns<const uint16_t>& mapfrac,
307 Columns<ScalarType>& dst, svuint16_t src_a,
308 svuint16_t src_b, svuint16_t src_c,
309 svuint16_t src_d, svuint32_t bias) {
310 4974 FracVectorType frac = svld1_u16(pg, &mapfrac[0]);
311 9948 svuint16_t result = interpolate_16point5<uint16_t>(pg, frac, src_a, src_b,
312 4974 src_c, src_d, bias);
313 4974 svst1_u16(pg, &dst[0], result);
314 4974 mapfrac += step;
315 4974 dst += step;
316 4974 }
317
318 Rows<const ScalarType> src_rows_;
319
320 private:
321 svuint16_t& v_src_element_stride_;
322 MapVectorType& v_xmax_;
323 MapVectorType& v_ymax_;
324 }; // end of class RemapS16Point5Replicate<uint16_t>
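
The uint16_t variant differs only in addressing: the stride is kept in elements (stride() / sizeof(ScalarType)), presumably so that wider row strides still fit the 16-bit lanes fed to the widening multiply-accumulate, and it is converted to a byte offset only at the last moment by the left shift in gather_load_src. A scalar sketch of the same calculation (the helper name is illustrative):

#include <cstddef>
#include <cstdint>

// Byte offset of element (x, y) in a uint16_t image, as computed above.
inline size_t byte_offset_u16(size_t stride_bytes, uint32_t x, uint32_t y) {
  const size_t stride_elems = stride_bytes / sizeof(uint16_t);
  const size_t elem_offset = y * stride_elems + x;  // svmlalb/svmlalt step
  return elem_offset * sizeof(uint16_t);            // svlsl_n_u32_x(..., 1) step
}
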
325
326 template <typename ScalarType>
327 class RemapS16Point5ConstantBorder;
328
329 template <>
330 class RemapS16Point5ConstantBorder<uint8_t> {
331 public:
332 using ScalarType = uint8_t;
333
334 132 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
335 size_t src_width, size_t src_height,
336 const ScalarType* border_value,
337 svuint16_t& v_src_stride, svuint16_t& v_width,
338 svuint16_t& v_height, svuint16_t& v_border)
339 132 : src_rows_{src_rows},
340 132 v_src_stride_{v_src_stride},
341 132 v_width_{v_width},
342 132 v_height_{v_height},
343 132 v_border_{v_border} {
344 132 v_src_stride_ = svdup_u16(src_rows.stride());
345 132 v_width_ = svdup_u16(static_cast<uint16_t>(src_width));
346 132 v_height_ = svdup_u16(static_cast<uint16_t>(src_height));
347 132 v_border_ = svdup_u16(*border_value);
348 132 }
349
350 156 void process_row(size_t width, Columns<const int16_t> mapxy,
351 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
352 156 svuint16_t one = svdup_n_u16(1);
353 156 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
2/2
✓ Branch 0 taken 156 times.
✓ Branch 1 taken 5272 times.
354 5428 for (size_t i = 0; i < width; i += svcnth()) {
355 5272 svbool_t pg = svwhilelt_b16_u64(i, width);
356
357 10544 svuint16x2_t xy =
358 10544 svld2_u16(pg, reinterpret_cast<const uint16_t*>(
359 5272 &mapxy[static_cast<ptrdiff_t>(i * 2)]));
360
361 5272 svuint16_t x0 = svget2(xy, 0);
362 5272 svuint16_t y0 = svget2(xy, 1);
363 5272 svuint16_t x1 = svadd_x(pg, x0, one);
364 5272 svuint16_t y1 = svadd_x(pg, y0, one);
365
366 10544 svuint16_t v00 = load_pixels_or_constant_border(
367 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y0);
368 10544 svuint16_t v01 = load_pixels_or_constant_border(
369 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y1);
370 10544 svuint16_t v10 = load_pixels_or_constant_border(
371 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y0);
372 10544 svuint16_t v11 = load_pixels_or_constant_border(
373 5272 src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y1);
374
375 5272 svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast<ptrdiff_t>(i)]);
376 10544 svuint16_t result =
377 5272 interpolate_16point5<uint8_t>(pg, frac, v00, v10, v01, v11, bias);
378
379 5272 svst1b_u16(pg, &dst[static_cast<ptrdiff_t>(i)], result);
380 5272 }
381 156 }
382
383 private:
384 21088 svuint16_t load_pixels_or_constant_border(Rows<const ScalarType> src_rows_,
385 svuint16_t& v_src_stride_,
386 svuint16_t& v_width_,
387 svuint16_t& v_height_,
388 svuint16_t& v_border_, svbool_t pg,
389 svuint16_t x, svuint16_t y) {
390 // Find whether coordinates are within the image dimensions.
391 42176 svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_),
392 21088 svcmplt_u16(pg, y, v_height_));
393
394 // Calculate offsets from coordinates (y * stride + x)
395 21088 svuint32_t offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_stride_);
396 21088 svuint32_t offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_stride_);
397
398 21088 svbool_t pg_b = in_range;
399 21088 svbool_t pg_t = svtrn2_b16(in_range, svpfalse());
400
401 // Copy pixels from source
402 42176 svuint32_t result_b =
403 21088 svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
404 42176 svuint32_t result_t =
405 21088 svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
406
407 42176 svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
408 21088 svreinterpret_u16_u32(result_t));
409
410 42176 return svsel(in_range, result, v_border_);
411 21088 }
412
413 Rows<const ScalarType> src_rows_;
414 svuint16_t& v_src_stride_;
415 svuint16_t& v_width_;
416 svuint16_t& v_height_;
417 svuint16_t& v_border_;
418 }; // end of class RemapS16Point5ConstantBorder<uint8_t>
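
A scalar model of load_pixels_or_constant_border (illustrative only; the helper name is not library API). The map coordinates are consumed as uint16_t reinterpretations of the signed values, so a negative coordinate wraps to a large positive one and a single unsigned x < width / y < height comparison rejects both underflow and overflow; lanes that fail the test keep their predicate bit clear, are never gathered, and receive the border value through svsel.

#include <cstddef>
#include <cstdint>

// Scalar model of one constant-border tap.
inline uint8_t pixel_or_border(const uint8_t* src, size_t src_stride,
                               int16_t xs, int16_t ys, uint16_t width,
                               uint16_t height, uint8_t border) {
  // Reinterpret as unsigned: -1 becomes 0xFFFF, so one comparison per axis
  // covers both "negative" and "past the edge".
  const uint16_t x = static_cast<uint16_t>(xs);
  const uint16_t y = static_cast<uint16_t>(ys);
  if (x < width && y < height) {
    return src[static_cast<size_t>(y) * src_stride + x];
  }
  return border;
}
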
419
420 template <>
421 class RemapS16Point5ConstantBorder<uint16_t> {
422 public:
423 using ScalarType = uint16_t;
424
425 132 RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows,
426 size_t src_width, size_t src_height,
427 const ScalarType* border_value,
428 svuint16_t& v_src_stride, svuint16_t& v_width,
429 svuint16_t& v_height, svuint16_t& v_border)
430 132 : src_rows_{src_rows},
431 132 v_src_element_stride_{v_src_stride},
432 132 v_width_{v_width},
433 132 v_height_{v_height},
434 132 v_border_{v_border} {
435 132 v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType));
436 132 v_width_ = svdup_u16(static_cast<uint16_t>(src_width));
437 132 v_height_ = svdup_u16(static_cast<uint16_t>(src_height));
438 132 v_border_ = svdup_u16(*border_value);
439 132 }
440
441 156 void process_row(size_t width, Columns<const int16_t> mapxy,
442 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
443 156 svuint16_t one = svdup_n_u16(1);
444 156 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
2/2
✓ Branch 0 taken 156 times.
✓ Branch 1 taken 4972 times.
445 5128 for (size_t i = 0; i < width; i += svcnth()) {
446 4972 svbool_t pg = svwhilelt_b16_u64(i, width);
447
448 9944 svuint16x2_t xy =
449 9944 svld2_u16(pg, reinterpret_cast<const uint16_t*>(
450 4972 &mapxy[static_cast<ptrdiff_t>(i * 2)]));
451
452 4972 svuint16_t x0 = svget2(xy, 0);
453 4972 svuint16_t y0 = svget2(xy, 1);
454 4972 svuint16_t x1 = svadd_x(pg, x0, one);
455 4972 svuint16_t y1 = svadd_x(pg, y0, one);
456
457 9944 svuint16_t v00 = load_pixels_or_constant_border(
458 4972 src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg,
459 4972 x0, y0);
460 9944 svuint16_t v01 = load_pixels_or_constant_border(
461 4972 src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg,
462 4972 x0, y1);
463 9944 svuint16_t v10 = load_pixels_or_constant_border(
464 4972 src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg,
465 4972 x1, y0);
466 9944 svuint16_t v11 = load_pixels_or_constant_border(
467 4972 src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg,
468 4972 x1, y1);
469
470 4972 svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast<ptrdiff_t>(i)]);
471 9944 svuint16_t result =
472 4972 interpolate_16point5<uint16_t>(pg, frac, v00, v10, v01, v11, bias);
473
474 4972 svst1_u16(pg, &dst[static_cast<ptrdiff_t>(i)], result);
475 4972 }
476 156 }
477
478 private:
479 19888 svuint16_t load_pixels_or_constant_border(Rows<const ScalarType> src_rows_,
480 svuint16_t& v_src_element_stride_,
481 svuint16_t& v_width_,
482 svuint16_t& v_height_,
483 svuint16_t& v_border_, svbool_t pg,
484 svuint16_t x, svuint16_t y) {
485 // Find whether coordinates are within the image dimensions.
486 39776 svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_),
487 19888 svcmplt_u16(pg, y, v_height_));
488
489 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
490 39776 svuint32_t offsets_b =
491 19888 svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_);
492 39776 svuint32_t offsets_t =
493 19888 svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_);
494
495 19888 svbool_t pg_b = in_range;
496 19888 svbool_t pg_t = svtrn2_b16(in_range, svpfalse());
497
498 // Account for the size of the source type when calculating offset
499 19888 offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1);
500 19888 offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1);
501
502 // Copy pixels from source
503 39776 svuint32_t result_b =
504 19888 svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
505 39776 svuint32_t result_t =
506 19888 svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
507
508 39776 svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
509 19888 svreinterpret_u16_u32(result_t));
510
511 39776 return svsel(in_range, result, v_border_);
512 19888 }
513
514 Rows<const ScalarType> src_rows_;
515 svuint16_t& v_src_element_stride_;
516 svuint16_t& v_width_;
517 svuint16_t& v_height_;
518 svuint16_t& v_border_;
519 }; // end of class RemapS16Point5ConstantBorder<uint16_t>
520
521 template <typename ScalarType>
522 class RemapS16Point5Replicate4ch;
523
524 template <>
525 class RemapS16Point5Replicate4ch<uint8_t> {
526 public:
527 using ScalarType = uint8_t;
528 using MapVecTraits = VecTraits<int16_t>;
529 using MapVectorType = typename MapVecTraits::VectorType;
530 using MapVector2Type = typename MapVecTraits::Vector2Type;
531 using FracVecTraits = VecTraits<uint16_t>;
532 using FracVectorType = typename FracVecTraits::VectorType;
533
534 132 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
535 size_t src_height, svuint16_t& v_src_stride,
536 MapVectorType& v_x_max, MapVectorType& v_y_max)
537 132 : src_rows_{src_rows},
538 132 v_src_stride_{v_src_stride},
539 132 v_xmax_{v_x_max},
540 132 v_ymax_{v_y_max} {
541 132 v_src_stride_ = svdup_u16(src_rows.stride());
542 132 v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1));
543 132 v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1));
544 132 }
545
546 156 void process_row(size_t width, Columns<const int16_t> mapxy,
547 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
548 156 LoopUnroll loop{width, MapVecTraits::num_lanes()};
549 5354 loop.unroll_once([&](size_t step) {
550 5198 svbool_t pg = MapVecTraits::svptrue();
551 5198 vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step));
552 5198 });
553 230 loop.remaining([&](size_t length, size_t step) {
554 74 svbool_t pg = MapVecTraits::svwhilelt(step - length, step);
555 74 vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length));
556 74 });
557 156 }
558
559 5272 void vector_path(svbool_t pg, Columns<const int16_t>& mapxy,
560 Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst,
561 ptrdiff_t step) {
562 5272 MapVector2Type xy = svld2_s16(pg, &mapxy[0]);
563 5272 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
564
565 // Clamp coordinates to within the dimensions of the source image
566 10544 svuint16_t x0 = svreinterpret_u16_s16(
567 5272 svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_)));
568 10544 svuint16_t y0 = svreinterpret_u16_s16(
569 5272 svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_)));
570
571 // x1 = x0 + 1, and clamp it too
572 10544 svuint16_t x1 = svreinterpret_u16_s16(
573 10544 svmax_x(pg, svdup_n_s16(0),
574 5272 svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_)));
575
576 10544 svuint16_t y1 = svreinterpret_u16_s16(
577 10544 svmax_x(pg, svdup_n_s16(0),
578 5272 svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_)));
579 5272 svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2);
580 5272 svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2);
581
582 // Calculate offsets from coordinates (y * stride + x), x multiplied by 4
583 // channels
584 26360 auto load_4ch_b = [&](svuint16_t x, svuint16_t y) {
585 21088 return svreinterpret_u8_u32(svld1_gather_u32offset_u32(
586 21088 pg_b, reinterpret_cast<const uint32_t*>(&src_rows_[0]),
587 21088 svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_)));
588 };
589 26360 auto load_4ch_t = [&](svuint16_t x, svuint16_t y) {
590 21088 return svreinterpret_u8_u32(svld1_gather_u32offset_u32(
591 21088 pg_t, reinterpret_cast<const uint32_t*>(&src_rows_[0]),
592 21088 svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_)));
593 };
594
595 5272 FracVectorType frac = svld1_u16(pg, &mapfrac[0]);
596 10544 svuint16_t xfrac =
597 5272 svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
598 10544 svuint16_t yfrac =
599 10544 svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
600 5272 svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
601
602 26360 auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac,
603 svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b,
604 svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
605 42176 svuint16_t line0 = svmla_x(
606 21088 svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a);
607 42176 svuint16_t line1 = svmla_x(
608 21088 svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c);
609
610 21088 svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac);
611 21088 svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac);
612 21088 acc_b = svmlalb_u32(acc_b, line1, yfrac);
613 21088 acc_t = svmlalt_u32(acc_t, line1, yfrac);
614
615 42176 return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
616 2ULL * REMAP16POINT5_FRAC_BITS);
617 21088 };
618
619 // bottom part
620 5272 svuint8_t a = load_4ch_b(x0, y0);
621 5272 svuint8_t b = load_4ch_b(x1, y0);
622 5272 svuint8_t c = load_4ch_b(x0, y1);
623 5272 svuint8_t d = load_4ch_b(x1, y1);
624 // from xfrac, we need the bottom part twice
625 5272 svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac);
626 10544 svuint16_t nxfrac2b = svsub_u16_x(
627 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b);
628 5272 svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac);
629 10544 svuint16_t nyfrac2b = svsub_u16_x(
630 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b);
631
632 // a,b,c,d look like 12341234... (four channels)
633 // bottom is 1313...
634 10544 svuint16_t res_bb =
635 10544 lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a),
636 5272 svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias);
637 // top is 2424...
638 10544 svuint16_t res_bt =
639 10544 lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a),
640 5272 svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias);
641 10544 svuint8_t res_b =
642 5272 svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt));
643
644 // top part
645 5272 a = load_4ch_t(x0, y0);
646 5272 b = load_4ch_t(x1, y0);
647 5272 c = load_4ch_t(x0, y1);
648 5272 d = load_4ch_t(x1, y1);
649 // from xfrac, we need the top part twice
650 5272 svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac);
651 10544 svuint16_t nxfrac2t = svsub_u16_x(
652 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t);
653 5272 svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac);
654 10544 svuint16_t nyfrac2t = svsub_u16_x(
655 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t);
656
657 // a,b,c,d look like 12341234... (four channels)
658 // bottom is 1313...
659 10544 svuint16_t res_tb =
660 10544 lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a),
661 5272 svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias);
662 // top is 2424...
663 10544 svuint16_t res_tt =
664 10544 lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a),
665 5272 svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias);
666 10544 svuint8_t res_t =
667 5272 svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt));
668
669 5272 svbool_t pg_low = svwhilelt_b32_u64(0L, static_cast<size_t>(step));
670 5272 svbool_t pg_high = svwhilelt_b32_u64(svcntw(), static_cast<size_t>(step));
671 10544 svuint32_t res_low =
672 5272 svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t));
673 10544 svuint32_t res_high =
674 5272 svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t));
675 5272 mapxy += step;
676 5272 svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low);
677 10544 svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(),
678 5272 res_high);
679 5272 mapfrac += step;
680 5272 dst += step;
681 5272 }
682
683 Rows<const ScalarType> src_rows_;
684
685 private:
686 svuint16_t& v_src_stride_;
687 MapVectorType& v_xmax_;
688 MapVectorType& v_ymax_;
689 }; // end of class RemapS16Point5Replicate4ch<uint8_t>
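
The four-channel path gathers each pixel's four bytes as one 32-bit load (hence the x coordinate shifted left by 2 in svshllb/svshllt) and reuses the per-pixel fractions for all four channels; the svtrn1/svtrn2 shuffles on the fractions only replicate each pixel's fraction so that it lines up with the channel lanes of the bottom and top halves. A scalar sketch of the per-pixel work, reusing the single-channel helper sketched after interpolate_16point5<uint8_t> above (all names illustrative):

#include <cstdint>

// Single-channel helper from the earlier sketch.
uint16_t interpolate_16point5_scalar(uint16_t frac, uint16_t a, uint16_t b,
                                     uint16_t c, uint16_t d,
                                     unsigned frac_bits);

// Bilinearly interpolate one 4-channel pixel; a, b, c, d each hold the four
// bytes of one source pixel, loaded as a single uint32_t.
inline uint32_t interpolate_4ch_scalar(uint16_t frac, uint32_t a, uint32_t b,
                                       uint32_t c, uint32_t d,
                                       unsigned frac_bits) {
  uint32_t out = 0;
  for (unsigned ch = 0; ch < 4; ++ch) {
    const unsigned shift = 8 * ch;
    const uint16_t r = interpolate_16point5_scalar(
        frac, static_cast<uint16_t>((a >> shift) & 0xFF),
        static_cast<uint16_t>((b >> shift) & 0xFF),
        static_cast<uint16_t>((c >> shift) & 0xFF),
        static_cast<uint16_t>((d >> shift) & 0xFF), frac_bits);
    out |= (static_cast<uint32_t>(r) & 0xFF) << shift;
  }
  return out;
}
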
690
691 template <>
692 class RemapS16Point5Replicate4ch<uint16_t> {
693 public:
694 using ScalarType = uint16_t;
695
696 132 RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
697 size_t src_height, svuint32_t& v_src_stride,
698 svint32_t& v_x_max, svint32_t& v_y_max)
699 132 : src_rows_{src_rows},
700 132 v_src_stride_{v_src_stride},
701 132 v_xmax_{v_x_max},
702 132 v_ymax_{v_y_max} {
703 132 v_src_stride_ = svdup_u32(src_rows.stride());
704 132 v_xmax_ = svdup_s32(static_cast<int32_t>(src_width - 1));
705 132 v_ymax_ = svdup_s32(static_cast<int32_t>(src_height - 1));
706 132 }
707
708 156 void process_row(size_t width, Columns<const int16_t> mapxy,
709 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
710 156 LoopUnroll loop{width, svcntw()};
711 10006 loop.unroll_once([&](size_t step) {
712 19700 vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), svptrue_b64(),
713 9850 svptrue_b64(), mapxy, mapfrac, dst,
714 9850 static_cast<ptrdiff_t>(step));
715 9850 });
716 216 loop.remaining([&](size_t length, size_t step) {
717 60 svbool_t pg = svwhilelt_b32_u64(step, step + length);
718 60 svbool_t pg64_b = svtrn1_b32(pg, svpfalse());
719 60 svbool_t pg64_t = svtrn2_b32(pg, svpfalse());
720 60 svbool_t pg_low = svzip1_b32(pg, svpfalse());
721 60 svbool_t pg_high = svzip2_b32(pg, svpfalse());
722 120 vector_path(pg, pg64_b, pg64_t, pg_low, pg_high, mapxy, mapfrac, dst,
723 60 static_cast<ptrdiff_t>(length));
724 60 });
725 156 }
726
727 9910 void vector_path(svbool_t pg, svbool_t pg64_b, svbool_t pg64_t,
728 svbool_t pg_low, svbool_t pg_high,
729 Columns<const int16_t>& mapxy,
730 Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst,
731 ptrdiff_t step) {
732 // Load one vector of xy: even coordinates are x, odd are y
733 19820 svint16_t xy = svreinterpret_s16_u32(
734 9910 svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0])));
735 9910 svint32_t x = svmovlb(xy);
736 9910 svint32_t y = svmovlt(xy);
737 // Clamp coordinates to within the dimensions of the source image
738 19820 svuint32_t x0 = svreinterpret_u32_s32(
739 9910 svmax_x(pg, svdup_n_s32(0), svmin_x(pg, x, v_xmax_)));
740 19820 svuint32_t y0 = svreinterpret_u32_s32(
741 9910 svmax_x(pg, svdup_n_s32(0), svmin_x(pg, y, v_ymax_)));
742
743 // x1 = x0 + 1, and clamp it too
744 19820 svuint32_t x1 = svreinterpret_u32_s32(svmax_x(
745 9910 pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, x, 1), v_xmax_)));
746 19820 svuint32_t y1 = svreinterpret_u32_s32(svmax_x(
747 9910 pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, y, 1), v_ymax_)));
748
749 89190 auto load_4ch = [&](svbool_t pg, svuint64_t offsets) {
750 79280 return svreinterpret_u16_u64(svld1_gather_u64offset_u64(
751 79280 pg, reinterpret_cast<const uint64_t*>(&src_rows_[0]), offsets));
752 };
753
754 9910 svuint16_t xfrac, yfrac, nxfrac, nyfrac;
755 {
756 // Fractions are loaded into even lanes
757 9910 svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0]));
758
759 // Fractions are doubled, 00112233... (will be doubled again later)
760 9910 svuint16_t frac = svtrn1(rawfrac, rawfrac);
761
762 9910 xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
763 19820 yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
764 9910 svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
765 9910 nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
766 9910 nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
767 9910 }
768
769 9910 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
770
771 29730 auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac,
772 svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b,
773 svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
774 19820 svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a);
775 19820 svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a);
776 19820 svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c);
777 19820 svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c);
778
779 39640 svuint32_t acc_b =
780 19820 svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac));
781 39640 svuint32_t acc_t =
782 19820 svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac));
783 19820 acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac));
784 19820 acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac));
785
786 39640 return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
787 2ULL * REMAP16POINT5_FRAC_BITS);
788 19820 };
789
790 // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit)
791 // Calculation is done in 2 parts, top and bottom
792 9910 svuint16_t res_b, res_t;
793
794 { // bottom
795 9910 svuint64_t x0w = svshllb_n_u64(x0, 3);
796 9910 svuint64_t x1w = svshllb_n_u64(x1, 3);
797 9910 svuint64_t ys0w = svmullb_u64(y0, v_src_stride_);
798 9910 svuint64_t ys1w = svmullb_u64(y1, v_src_stride_);
799 9910 svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w);
800 9910 svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w);
801 9910 svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w);
802 9910 svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w);
803
804 9910 svuint16_t a = load_4ch(pg64_b, offsets_a);
805 9910 svuint16_t b = load_4ch(pg64_b, offsets_b);
806 9910 svuint16_t c = load_4ch(pg64_b, offsets_c);
807 9910 svuint16_t d = load_4ch(pg64_b, offsets_d);
808
809 // Copy even lanes twice -> 000022224444... these are the "bottom"
810 // fractions
811 19820 svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32(
812 9910 svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac)));
813 19820 svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32(
814 9910 svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac)));
815 19820 svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32(
816 9910 svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac)));
817 19820 svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32(
818 9910 svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac)));
819
820 9910 res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias);
821 9910 }
822
823 { // top
824 9910 svuint64_t x0w = svshllt_n_u64(x0, 3);
825 9910 svuint64_t x1w = svshllt_n_u64(x1, 3);
826 9910 svuint64_t ys0w = svmullt_u64(y0, v_src_stride_);
827 9910 svuint64_t ys1w = svmullt_u64(y1, v_src_stride_);
828 9910 svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w);
829 9910 svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w);
830 9910 svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w);
831 9910 svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w);
832
833 9910 svuint16_t a = load_4ch(pg64_t, offsets_a);
834 9910 svuint16_t b = load_4ch(pg64_t, offsets_b);
835 9910 svuint16_t c = load_4ch(pg64_t, offsets_c);
836 9910 svuint16_t d = load_4ch(pg64_t, offsets_d);
837
838 // Copy odd lanes twice -> 111133335555... these are the "top"
839 // fractions
840 19820 svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32(
841 9910 svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac)));
842 19820 svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32(
843 9910 svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac)));
844 19820 svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32(
845 9910 svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac)));
846 19820 svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32(
847 9910 svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac)));
848
849 9910 res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias);
850 9910 }
851
852 19820 svuint64_t res_low =
853 9910 svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t));
854 19820 svuint64_t res_high =
855 9910 svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t));
856 9910 svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low);
857 19820 svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(),
858 9910 res_high);
859 9910 mapxy += step;
860 9910 mapfrac += step;
861 9910 dst += step;
862 9910 }
863
864 Rows<const ScalarType> src_rows_;
865
866 private:
867 svuint32_t& v_src_stride_;
868 svint32_t& v_xmax_;
869 svint32_t& v_ymax_;
870 }; // end of class RemapS16Point5Replicate4ch<uint16_t>
871
872 template <typename ScalarType>
873 class RemapS16Point5Constant4ch;
874
875 template <>
876 class RemapS16Point5Constant4ch<uint8_t> {
877 public:
878 using ScalarType = uint8_t;
879
880 132 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
881 size_t src_height, const ScalarType* border_value,
882 svuint16_t& v_src_stride, svuint16_t& v_x_max,
883 svuint16_t& v_y_max, svuint32_t& v_border)
884 132 : src_rows_{src_rows},
885 132 v_src_stride_{v_src_stride},
886 132 v_xmax_{v_x_max},
887 132 v_ymax_{v_y_max},
888 132 v_border_{v_border} {
889 132 v_src_stride_ = svdup_u16(src_rows.stride());
890 132 v_xmax_ = svdup_u16(static_cast<uint16_t>(src_width - 1));
891 132 v_ymax_ = svdup_u16(static_cast<uint16_t>(src_height - 1));
892 132 uint32_t border_value_u32{};
893 132 memcpy(&border_value_u32, border_value, sizeof(uint32_t));
894 132 v_border_ = svdup_u32(border_value_u32);
895 132 }
896
897 156 void process_row(size_t width, Columns<const int16_t> mapxy,
898 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
899 156 LoopUnroll loop{width, svcnth()};
900 5354 loop.unroll_once([&](size_t step) {
901 5198 svbool_t pg = svptrue_b16();
902 5198 vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step));
903 5198 });
904 230 loop.remaining([&](size_t length, size_t step) {
905 74 svbool_t pg = svwhilelt_b16_u64(step - length, step);
906 74 vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length));
907 74 });
908 156 }
909
910 5272 void vector_path(svbool_t pg, Columns<const int16_t>& mapxy,
911 Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst,
912 ptrdiff_t step) {
913 10544 svuint16x2_t xy =
914 5272 svld2_u16(pg, reinterpret_cast<const uint16_t*>(&mapxy[0]));
915 5272 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
916
917 // Negative values become big positive ones
918 5272 svuint16_t x0 = svget2(xy, 0);
919 5272 svuint16_t y0 = svget2(xy, 1);
920 5272 svuint16_t x1 = svadd_n_u16_x(pg, x0, 1);
921 5272 svuint16_t y1 = svadd_n_u16_x(pg, y0, 1);
922
923 // Calculate offsets from coordinates (y * stride + x), x multiplied by 4
924 // channels
925 26360 auto load_4ch_or_border_b = [&](svuint16_t x, svuint16_t y) {
926 42176 svbool_t in_range_b16 =
927 21088 svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_));
928 21088 svbool_t in_range = svtrn1_b16(in_range_b16, svpfalse());
929 42176 svuint32_t image = svld1_gather_u32offset_u32(
930 21088 in_range, reinterpret_cast<const uint32_t*>(&src_rows_[0]),
931 21088 svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_));
932 42176 return svreinterpret_u8_u32(svsel(in_range, image, v_border_));
933 21088 };
934 26360 auto load_4ch_or_border_t = [&](svuint16_t x, svuint16_t y) {
935 42176 svbool_t in_range_b16 =
936 21088 svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_));
937 21088 svbool_t in_range = svtrn2_b16(in_range_b16, svpfalse());
938 42176 svuint32_t image = svld1_gather_u32offset_u32(
939 21088 in_range, reinterpret_cast<const uint32_t*>(&src_rows_[0]),
940 21088 svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_));
941 42176 return svreinterpret_u8_u32(svsel(in_range, image, v_border_));
942 21088 };
943
944 5272 svuint16_t frac = svld1_u16(pg, &mapfrac[0]);
945 10544 svuint16_t xfrac =
946 5272 svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
947 10544 svuint16_t yfrac =
948 10544 svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
949 5272 svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
950
951 26360 auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac,
952 svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b,
953 svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
954 42176 svuint16_t line0 = svmla_x(
955 21088 svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a);
956 42176 svuint16_t line1 = svmla_x(
957 21088 svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c);
958
959 21088 svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac);
960 21088 svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac);
961 21088 acc_b = svmlalb_u32(acc_b, line1, yfrac);
962 21088 acc_t = svmlalt_u32(acc_t, line1, yfrac);
963
964 42176 return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
965 2ULL * REMAP16POINT5_FRAC_BITS);
966 21088 };
967
968 // bottom part
969 5272 svuint8_t a = load_4ch_or_border_b(x0, y0);
970 5272 svuint8_t b = load_4ch_or_border_b(x1, y0);
971 5272 svuint8_t c = load_4ch_or_border_b(x0, y1);
972 5272 svuint8_t d = load_4ch_or_border_b(x1, y1);
973 // from xfrac, we need the bottom part twice
974 5272 svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac);
975 10544 svuint16_t nxfrac2b = svsub_u16_x(
976 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b);
977 5272 svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac);
978 10544 svuint16_t nyfrac2b = svsub_u16_x(
979 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b);
980
981 // a,b,c,d look like 12341234... (four channels)
982 // bottom is 1313...
983 10544 svuint16_t res_bb =
984 10544 lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a),
985 5272 svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias);
986 // top is 2424...
987 10544 svuint16_t res_bt =
988 10544 lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a),
989 5272 svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias);
990 10544 svuint8_t res_b =
991 5272 svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt));
992
993 // top part
994 5272 a = load_4ch_or_border_t(x0, y0);
995 5272 b = load_4ch_or_border_t(x1, y0);
996 5272 c = load_4ch_or_border_t(x0, y1);
997 5272 d = load_4ch_or_border_t(x1, y1);
998 // from xfrac, we need the top part twice
999 5272 svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac);
1000 10544 svuint16_t nxfrac2t = svsub_u16_x(
1001 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t);
1002 5272 svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac);
1003 10544 svuint16_t nyfrac2t = svsub_u16_x(
1004 5272 svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t);
1005
1006 // a,b,c,d look like 12341234... (four channels)
1007 // bottom is 1313...
1008 10544 svuint16_t res_tb =
1009 10544 lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a),
1010 5272 svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias);
1011 // top is 2424...
1012 10544 svuint16_t res_tt =
1013 10544 lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a),
1014 5272 svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias);
1015 10544 svuint8_t res_t =
1016 5272 svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt));
1017
1018 5272 svbool_t pg_low = svwhilelt_b32_u64(0L, static_cast<size_t>(step));
1019 5272 svbool_t pg_high = svwhilelt_b32_u64(svcntw(), static_cast<size_t>(step));
1020 10544 svuint32_t res_low =
1021 5272 svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t));
1022 10544 svuint32_t res_high =
1023 5272 svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t));
1024 5272 mapxy += step;
1025 5272 svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low);
1026 10544 svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(),
1027 5272 res_high);
1028 5272 mapfrac += step;
1029 5272 dst += step;
1030 5272 }
1031
1032 Rows<const ScalarType> src_rows_;
1033
1034 private:
1035 svuint16_t& v_src_stride_;
1036 svuint16_t& v_xmax_;
1037 svuint16_t& v_ymax_;
1038 svuint32_t& v_border_;
1039 }; // end of class RemapS16Point5Constant4ch<uint8_t>
1040
1041 template <>
1042 class RemapS16Point5Constant4ch<uint16_t> {
1043 public:
1044 using ScalarType = uint16_t;
1045
1046 132 RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width,
1047 size_t src_height, const ScalarType* border_value,
1048 svuint32_t& v_src_stride, svuint32_t& v_x_max,
1049 svuint32_t& v_y_max, svuint64_t& v_border)
1050 132 : src_rows_{src_rows},
1051 132 v_src_stride_{v_src_stride},
1052 132 v_xmax_{v_x_max},
1053 132 v_ymax_{v_y_max},
1054 132 v_border_{v_border} {
1055 132 v_src_stride_ = svdup_u32(src_rows.stride());
1056 132 v_xmax_ = svdup_u32(static_cast<uint32_t>(src_width - 1));
1057 132 v_ymax_ = svdup_u32(static_cast<uint32_t>(src_height - 1));
1058 132 uint64_t border_value_u64{};
1059 132 memcpy(&border_value_u64, border_value, sizeof(uint64_t));
1060 132 v_border_ = svdup_u64(border_value_u64);
1061 132 }
1062
1063 156 void process_row(size_t width, Columns<const int16_t> mapxy,
1064 Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
1065 156 LoopUnroll loop{width, svcntw()};
1066 10006 loop.unroll_once([&](size_t step) {
1067 19700 vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), mapxy, mapfrac,
1068 9850 dst, static_cast<ptrdiff_t>(step));
1069 9850 });
1070 216 loop.remaining([&](size_t length, size_t step) {
1071 60 svbool_t pg = svwhilelt_b32_u64(step, step + length);
1072 60 svbool_t pg_low = svzip1_b32(pg, svpfalse());
1073 60 svbool_t pg_high = svzip2_b32(pg, svpfalse());
1074 120 vector_path(pg, pg_low, pg_high, mapxy, mapfrac, dst,
1075 60 static_cast<ptrdiff_t>(length));
1076 60 });
1077 156 }
1078
1079 9910 void vector_path(svbool_t pg, svbool_t pg_low, svbool_t pg_high,
1080 Columns<const int16_t>& mapxy,
1081 Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst,
1082 ptrdiff_t step) {
1083 // Load one vector of xy: even coordinates are x, odd are y
1084 19820 svint16_t xy = svreinterpret_s16_u32(
1085 9910 svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0])));
1086
1087 // Negative values become big positive ones
1088 // Widening is signed, so 16-bit -1 becomes 32-bit -1
1089 9910 svuint32_t x0 = svreinterpret_u32_s32(svmovlb(xy));
1090 9910 svuint32_t y0 = svreinterpret_u32_s32(svmovlt(xy));
1091 9910 svuint32_t x1 = svadd_n_u32_x(pg, x0, 1);
1092 9910 svuint32_t y1 = svadd_n_u32_x(pg, y0, 1);
1093
1094 49550 auto load_4ch_or_border_b = [&](svuint32_t x, svuint32_t y) {
1095 79280 svbool_t in_range_b32 =
1096 39640 svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_));
1097 39640 svbool_t in_range = svtrn1_b32(in_range_b32, svpfalse());
1098 79280 svuint64_t image = svld1_gather_u64offset_u64(
1099 39640 in_range, reinterpret_cast<const uint64_t*>(&src_rows_[0]),
1100 39640 svmlalb_u64(svshllb_n_u64(x, 3), y, v_src_stride_));
1101 79280 return svreinterpret_u16_u64(svsel(in_range, image, v_border_));
1102 39640 };
1103
1104 49550 auto load_4ch_or_border_t = [&](svuint32_t x, svuint32_t y) {
1105 79280 svbool_t in_range_b32 =
1106 39640 svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_));
1107 39640 svbool_t in_range = svtrn2_b32(in_range_b32, svpfalse());
1108 79280 svuint64_t image = svld1_gather_u64offset_u64(
1109 39640 in_range, reinterpret_cast<const uint64_t*>(&src_rows_[0]),
1110 39640 svmlalt_u64(svshllt_n_u64(x, 3), y, v_src_stride_));
1111 79280 return svreinterpret_u16_u64(svsel(in_range, image, v_border_));
1112 39640 };
1113
1114 9910 svuint16_t xfrac, yfrac, nxfrac, nyfrac;
1115 {
1116 // Fractions are loaded into even lanes
1117 9910 svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0]));
1118
1119 // Fractions are doubled, 00112233... (will be doubled again later)
1120 9910 svuint16_t frac = svtrn1(rawfrac, rawfrac);
1121
1122 9910 xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
1123 19820 yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
1124 9910 svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
1125 9910 nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
1126 9910 nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
1127 9910 }
1128
1129 9910 svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
1130
1131 29730 auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac,
1132 svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b,
1133 svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
1134 19820 svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a);
1135 19820 svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a);
1136 19820 svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c);
1137 19820 svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c);
1138
1139 39640 svuint32_t acc_b =
1140 19820 svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac));
1141 39640 svuint32_t acc_t =
1142 19820 svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac));
1143 19820 acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac));
1144 19820 acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac));
1145
1146 39640 return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
1147 2ULL * REMAP16POINT5_FRAC_BITS);
1148 19820 };
1149
1150 // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit)
1151 // Calculation is done in 2 parts, top and bottom
1152 9910 svuint16_t res_b, res_t;
1153
1154 { // bottom
1155 9910 svuint16_t a = load_4ch_or_border_b(x0, y0);
1156 9910 svuint16_t b = load_4ch_or_border_b(x1, y0);
1157 9910 svuint16_t c = load_4ch_or_border_b(x0, y1);
1158 9910 svuint16_t d = load_4ch_or_border_b(x1, y1);
1159
1160 // Copy even lanes twice -> 000022224444... these are the "bottom"
1161 // fractions
1162 19820 svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32(
1163 9910 svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac)));
1164 19820 svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32(
1165 9910 svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac)));
1166 19820 svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32(
1167 9910 svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac)));
1168 19820 svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32(
1169 9910 svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac)));
1170
1171 9910 res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias);
1172 9910 }
1173
1174 { // top
1175 9910 svuint16_t a = load_4ch_or_border_t(x0, y0);
1176 9910 svuint16_t b = load_4ch_or_border_t(x1, y0);
1177 9910 svuint16_t c = load_4ch_or_border_t(x0, y1);
1178 9910 svuint16_t d = load_4ch_or_border_t(x1, y1);
1179
1180 // Copy odd lanes twice -> 111133335555... these are the "top"
1181 // fractions
1182 19820 svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32(
1183 9910 svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac)));
1184 19820 svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32(
1185 9910 svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac)));
1186 19820 svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32(
1187 9910 svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac)));
1188 19820 svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32(
1189 9910 svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac)));
1190
1191 9910 res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias);
1192 9910 }
1193
1194 19820 svuint64_t res_low =
1195 9910 svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t));
1196 19820 svuint64_t res_high =
1197 9910 svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t));
1198 9910 svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low);
1199 19820 svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(),
1200 9910 res_high);
1201 9910 mapxy += step;
1202 9910 mapfrac += step;
1203 9910 dst += step;
1204 9910 }
1205
1206 Rows<const ScalarType> src_rows_;
1207
1208 private:
1209 svuint32_t& v_src_stride_;
1210 svuint32_t& v_xmax_;
1211 svuint32_t& v_ymax_;
1212 svuint64_t& v_border_;
1213 }; // end of class RemapS16Point5Constant4ch<uint16_t>
1214
1215 // Most of the complexity comes from parameter checking.
1216 // NOLINTBEGIN(readability-function-cognitive-complexity)
1217 template <typename T>
1218 1140 kleidicv_error_t remap_s16point5(const T* src, size_t src_stride,
1219 size_t src_width, size_t src_height, T* dst,
1220 size_t dst_stride, size_t dst_width,
1221 size_t dst_height, size_t channels,
1222 const int16_t* mapxy, size_t mapxy_stride,
1223 const uint16_t* mapfrac, size_t mapfrac_stride,
1224 kleidicv_border_type_t border_type,
1225 const T* border_value) {
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 568 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 568 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 568 times.
1226 1140 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 566 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 566 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 566 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 566 times.
1227 1136 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 564 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 564 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 564 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 564 times.
1228 1132 CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 562 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 562 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 562 times.
1229 1128 CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height);
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 556 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 556 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 560 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 556 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 556 times.
1230 1124 CHECK_IMAGE_SIZE(src_width, src_height);
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 554 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 552 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 552 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 554 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 552 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 552 times.
1231 1112 CHECK_IMAGE_SIZE(dst_width, dst_height);
8/8
✓ Branch 0 taken 268 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 268 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 266 times.
✓ Branch 7 taken 2 times.
1232 1104 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
1233 4 return KLEIDICV_ERROR_NULL_POINTER;
1234 }
1235
8/8
✓ Branch 0 taken 530 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 530 times.
✓ Branch 3 taken 20 times.
✓ Branch 4 taken 530 times.
✓ Branch 5 taken 20 times.
✓ Branch 6 taken 530 times.
✓ Branch 7 taken 20 times.
1236 2200 if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height,
1237 1100 dst_width, border_type, channels)) {
1238 40 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1239 }
1240
1241 1060 Rows<const T> src_rows{src, src_stride, channels};
1242 1060 Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
1243 1060 Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1};
1244 1060 Rows<T> dst_rows{dst, dst_stride, channels};
1245 1060 svuint16_t sv_src_stride;
1246 1060 Rectangle rect{dst_width, dst_height};
1247
4/4
✓ Branch 0 taken 266 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 264 times.
1248 1060 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
4/4
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 132 times.
1249 528 if (channels == 1) {
1250 264 svuint16_t sv_width, sv_height, sv_border;
1251 528 RemapS16Point5ConstantBorder<T> operation{
1252 264 src_rows, src_width, src_height, border_value,
1253 sv_src_stride, sv_width, sv_height, sv_border};
1254 264 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1255 264 } else {
1256 assert(channels == 4);
1257 typedef typename double_element_width<T>::type DoubleType;
1258 typedef typename double_element_width<DoubleType>::type QuadType;
1259 264 typename VecTraits<DoubleType>::VectorType sv_width, sv_height,
1260 sv_src_stride;
1261 264 typename VecTraits<QuadType>::VectorType sv_border;
1262 528 RemapS16Point5Constant4ch<T> operation{
1263 264 src_rows, src_width, src_height, border_value,
1264 sv_src_stride, sv_width, sv_height, sv_border};
1265 264 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1266 264 }
1267 528 } else {
1268 assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
1269 532 svint16_t sv_xmax, sv_ymax;
4/4
✓ Branch 0 taken 134 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 134 times.
✓ Branch 3 taken 132 times.
1270 532 if (channels == 1) {
1271 268 RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height,
1272 sv_src_stride, sv_xmax, sv_ymax};
1273 268 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1274 268 } else {
1275 assert(channels == 4);
1276 if constexpr (std::is_same<T, uint8_t>::value) {
1277 264 RemapS16Point5Replicate4ch<T> operation{
1278 132 src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax};
1279 132 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1280 132 }
1281 if constexpr (std::is_same<T, uint16_t>::value) {
1282 132 svuint32_t stride;
1283 132 svint32_t xmax, ymax;
1284 132 RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height,
1285 stride, xmax, ymax};
1286 132 zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
1287 132 }
1288 }
1289 532 }
1290 1060 return KLEIDICV_OK;
1291 1140 }
1292 // NOLINTEND(readability-function-cognitive-complexity)
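
To tie the pieces together, here is a scalar reference for the simplest case dispatched above (single channel, uint8_t, replicate border). It is a sketch only, not library API, and it rests on the assumptions already used in the sketches above: strides are byte strides, mapxy stores interleaved (x, y) int16_t pairs per destination pixel, mapfrac packs yfrac << REMAP16POINT5_FRAC_BITS | xfrac, and REMAP16POINT5_FRAC_MAX == 1 << REMAP16POINT5_FRAC_BITS.

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Scalar reference for the 1-channel uint8_t, replicate-border case.
void remap_s16point5_reference(const uint8_t* src, size_t src_stride,
                               size_t src_w, size_t src_h, uint8_t* dst,
                               size_t dst_stride, size_t dst_w, size_t dst_h,
                               const int16_t* mapxy, size_t mapxy_stride,
                               const uint16_t* mapfrac, size_t mapfrac_stride,
                               unsigned frac_bits) {
  const uint32_t frac_max = 1u << frac_bits;
  // Replicate border: clamp the coordinate, then index with a byte stride.
  auto at = [&](int x, int y) -> uint32_t {
    x = std::clamp(x, 0, static_cast<int>(src_w) - 1);
    y = std::clamp(y, 0, static_cast<int>(src_h) - 1);
    return src[static_cast<size_t>(y) * src_stride + static_cast<size_t>(x)];
  };
  for (size_t row = 0; row < dst_h; ++row) {
    const auto* xy = reinterpret_cast<const int16_t*>(
        reinterpret_cast<const uint8_t*>(mapxy) + row * mapxy_stride);
    const auto* fr = reinterpret_cast<const uint16_t*>(
        reinterpret_cast<const uint8_t*>(mapfrac) + row * mapfrac_stride);
    uint8_t* out = dst + row * dst_stride;
    for (size_t col = 0; col < dst_w; ++col) {
      const int x = xy[2 * col];
      const int y = xy[2 * col + 1];
      const uint32_t xf = fr[col] & (frac_max - 1);
      const uint32_t yf = (fr[col] >> frac_bits) & (frac_max - 1);
      const uint32_t line0 = (frac_max - xf) * at(x, y) + xf * at(x + 1, y);
      const uint32_t line1 =
          (frac_max - xf) * at(x, y + 1) + xf * at(x + 1, y + 1);
      const uint32_t acc =
          frac_max * frac_max / 2 + (frac_max - yf) * line0 + yf * line1;
      out[col] = static_cast<uint8_t>(acc >> (2 * frac_bits));
    }
  }
}
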
1293
1294 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \
1295 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \
1296 const type* src, size_t src_stride, size_t src_width, size_t src_height, \
1297 type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
1298 size_t channels, const int16_t* mapxy, size_t mapxy_stride, \
1299 const uint16_t* mapfrac, size_t mapfrac_stride, \
1300 kleidicv_border_type_t border_type, const type* border_value)
1301
1302 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
1303 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
1304
1305 } // namespace kleidicv::sve2
1306