Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cassert> | ||
6 | #include <cmath> | ||
7 | #include <cstddef> | ||
8 | #include <cstdint> | ||
9 | |||
10 | #include "kleidicv/sve2.h" | ||
11 | #include "kleidicv/traits.h" | ||
12 | #include "kleidicv/transform/remap.h" | ||
13 | #include "transform_sve2.h" | ||
14 | |||
15 | namespace kleidicv::sve2 { | ||
16 | |||
17 | template <typename ScalarType> | ||
18 | inline svuint16_t interpolate_16point5(svbool_t pg, svuint16_t frac, | ||
19 | svuint16_t src_a, svuint16_t src_b, | ||
20 | svuint16_t src_c, svuint16_t src_d, | ||
21 | svuint32_t bias); | ||
22 | |||
23 | template <> | ||
24 | 10546 | inline svuint16_t interpolate_16point5<uint8_t>( | |
25 | svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b, | ||
26 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
27 | 10546 | svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
28 | 21092 | svuint16_t yfrac = | |
29 | 21092 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
30 | 10546 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
31 | 21092 | svuint16_t nxfrac = | |
32 | 10546 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
33 | 21092 | svuint16_t nyfrac = | |
34 | 10546 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
35 | 10546 | svuint16_t line0 = svmla_x(pg, svmul_x(pg, xfrac, src_b), nxfrac, src_a); | |
36 | 10546 | svuint16_t line1 = svmla_x(pg, svmul_x(pg, xfrac, src_d), nxfrac, src_c); | |
37 | |||
38 | 10546 | svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); | |
39 | 10546 | svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); | |
40 | 10546 | acc_b = svmlalb_u32(acc_b, line1, yfrac); | |
41 | 10546 | acc_t = svmlalt_u32(acc_t, line1, yfrac); | |
42 | |||
43 | 21092 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
44 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
45 | 10546 | } | |
46 | |||
47 | template <> | ||
48 | 9946 | inline svuint16_t interpolate_16point5<uint16_t>( | |
49 | svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b, | ||
50 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
51 | 9946 | svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
52 | 19892 | svuint16_t yfrac = | |
53 | 19892 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
54 | 9946 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
55 | 19892 | svuint16_t nxfrac = | |
56 | 9946 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
57 | 19892 | svuint16_t nyfrac = | |
58 | 9946 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
59 | 19892 | svuint32_t line0_b = svmla_x(pg, svmullb(xfrac, src_b), svmovlb_u32(nxfrac), | |
60 | 9946 | svmovlb_u32(src_a)); | |
61 | 19892 | svuint32_t line0_t = svmla_x(pg, svmullt(xfrac, src_b), svmovlt_u32(nxfrac), | |
62 | 9946 | svmovlt_u32(src_a)); | |
63 | 19892 | svuint32_t line1_b = svmla_x(pg, svmullb(xfrac, src_d), svmovlb_u32(nxfrac), | |
64 | 9946 | svmovlb_u32(src_c)); | |
65 | 19892 | svuint32_t line1_t = svmla_x(pg, svmullt(xfrac, src_d), svmovlt_u32(nxfrac), | |
66 | 9946 | svmovlt_u32(src_c)); | |
67 | |||
68 | 19892 | svuint32_t acc_b = | |
69 | 19892 | svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_b, svmovlb_u32(nyfrac)), | |
70 | 9946 | line1_b, svmovlb_u32(yfrac)); | |
71 | 19892 | svuint32_t acc_t = | |
72 | 19892 | svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_t, svmovlt_u32(nyfrac)), | |
73 | 9946 | line1_t, svmovlt_u32(yfrac)); | |
74 | |||
75 | 19892 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
76 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
77 | 9946 | } | |
78 | |||
// Remap row processor that clamps map coordinates to the source image.
// Declared only; defined through explicit specialisations per element type.
template <class ScalarType>
class RemapS16Point5Replicate;
81 | |||
82 | template <> | ||
83 | class RemapS16Point5Replicate<uint8_t> { | ||
84 | public: | ||
85 | using ScalarType = uint8_t; | ||
86 | using MapVecTraits = VecTraits<int16_t>; | ||
87 | using MapVectorType = typename MapVecTraits::VectorType; | ||
88 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
89 | using FracVecTraits = VecTraits<uint16_t>; | ||
90 | using FracVectorType = typename FracVecTraits::VectorType; | ||
91 | |||
92 | 134 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
93 | size_t src_height, svuint16_t& v_src_stride, | ||
94 | MapVectorType& v_x_max, MapVectorType& v_y_max) | ||
95 | 134 | : src_rows_{src_rows}, | |
96 | 134 | v_src_stride_{v_src_stride}, | |
97 | 134 | v_xmax_{v_x_max}, | |
98 | 134 | v_ymax_{v_y_max} { | |
99 | 134 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
100 | 134 | v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1)); | |
101 | 134 | v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1)); | |
102 | 134 | } | |
103 | |||
104 | 158 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
105 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
106 | 158 | svuint16_t src_a, src_b, src_c, src_d; | |
107 | |||
108 | 158 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
109 | 5432 | auto vector_path = [&](svbool_t pg, ptrdiff_t step) { | |
110 | 5274 | load_source(pg, step, mapxy, src_a, src_b, src_c, src_d); | |
111 | 10548 | interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d, | |
112 | 5274 | bias); | |
113 | 5274 | }; | |
114 | |||
115 | 158 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
116 | 5358 | loop.unroll_once([&](size_t step) { | |
117 | 5200 | svbool_t pg = MapVecTraits::svptrue(); | |
118 | 5200 | vector_path(pg, static_cast<ptrdiff_t>(step)); | |
119 | 5200 | }); | |
120 | 232 | loop.remaining([&](size_t length, size_t step) { | |
121 | 74 | svbool_t pg = MapVecTraits::svwhilelt(step - length, step); | |
122 | 74 | vector_path(pg, static_cast<ptrdiff_t>(length)); | |
123 | 74 | }); | |
124 | 158 | } | |
125 | |||
126 | protected: | ||
127 | 21096 | svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t, | |
128 | svuint32_t offsets_t) { | ||
129 | 42192 | svuint32_t src_b = | |
130 | 21096 | svldnt1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
131 | 42192 | svuint32_t src_t = | |
132 | 21096 | svldnt1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
133 | 63288 | return svtrn1_u16(svreinterpret_u16_u32(src_b), | |
134 | 21096 | svreinterpret_u16_u32(src_t)); | |
135 | 21096 | } | |
136 | |||
137 | 5274 | void load_source(svbool_t pg, ptrdiff_t step, Columns<const int16_t>& mapxy, | |
138 | svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c, | ||
139 | svuint16_t& src_d) { | ||
140 | 5274 | MapVector2Type xy = svld2_s16(pg, &mapxy[0]); | |
141 | |||
142 | // Clamp coordinates to within the dimensions of the source image | ||
143 | 10548 | svuint16_t x0 = svreinterpret_u16_s16( | |
144 | 5274 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); | |
145 | 10548 | svuint16_t y0 = svreinterpret_u16_s16( | |
146 | 5274 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); | |
147 | |||
148 | // x1 = x0 + 1, and clamp it too | ||
149 | 10548 | svuint16_t x1 = svreinterpret_u16_s16( | |
150 | 10548 | svmax_x(pg, svdup_n_s16(0), | |
151 | 5274 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); | |
152 | |||
153 | 10548 | svuint16_t y1 = svreinterpret_u16_s16( | |
154 | 10548 | svmax_x(pg, svdup_n_s16(0), | |
155 | 5274 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); | |
156 | 5274 | svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); | |
157 | 5274 | svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); | |
158 | |||
159 | // Calculate offsets from coordinates (y * stride + x) | ||
160 | 5274 | svuint32_t offsets_a_b = svmlalb_u32(svmovlb_u32(x0), y0, v_src_stride_); | |
161 | 5274 | svuint32_t offsets_a_t = svmlalt_u32(svmovlt_u32(x0), y0, v_src_stride_); | |
162 | 5274 | svuint32_t offsets_b_b = svmlalb_u32(svmovlb_u32(x1), y0, v_src_stride_); | |
163 | 5274 | svuint32_t offsets_b_t = svmlalt_u32(svmovlt_u32(x1), y0, v_src_stride_); | |
164 | 5274 | svuint32_t offsets_c_b = svmlalb_u32(svmovlb_u32(x0), y1, v_src_stride_); | |
165 | 5274 | svuint32_t offsets_c_t = svmlalt_u32(svmovlt_u32(x0), y1, v_src_stride_); | |
166 | 5274 | svuint32_t offsets_d_b = svmlalb_u32(svmovlb_u32(x1), y1, v_src_stride_); | |
167 | 5274 | svuint32_t offsets_d_t = svmlalt_u32(svmovlt_u32(x1), y1, v_src_stride_); | |
168 | |||
169 | // Load pixels from source | ||
170 | 5274 | src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t); | |
171 | 5274 | src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t); | |
172 | 5274 | src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t); | |
173 | 5274 | src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t); | |
174 | 5274 | mapxy += step; | |
175 | 5274 | } | |
176 | |||
177 | 5274 | void interpolate_and_store(svbool_t pg, ptrdiff_t step, | |
178 | Columns<const uint16_t>& mapfrac, | ||
179 | Columns<ScalarType>& dst, svuint16_t src_a, | ||
180 | svuint16_t src_b, svuint16_t src_c, | ||
181 | svuint16_t src_d, svuint32_t bias) { | ||
182 | 5274 | FracVectorType frac = svld1_u16(pg, &mapfrac[0]); | |
183 | 10548 | svuint16_t result = interpolate_16point5<uint8_t>(pg, frac, src_a, src_b, | |
184 | 5274 | src_c, src_d, bias); | |
185 | 5274 | svst1b_u16(pg, &dst[0], result); | |
186 | 5274 | mapfrac += step; | |
187 | 5274 | dst += step; | |
188 | 5274 | } | |
189 | |||
190 | Rows<const ScalarType> src_rows_; | ||
191 | |||
192 | private: | ||
193 | svuint16_t& v_src_stride_; | ||
194 | MapVectorType& v_xmax_; | ||
195 | MapVectorType& v_ymax_; | ||
196 | }; // end of class RemapS16Point5Replicate<uint8_t> | ||
197 | |||
198 | template <> | ||
199 | class RemapS16Point5Replicate<uint16_t> { | ||
200 | public: | ||
201 | using ScalarType = uint16_t; | ||
202 | using MapVecTraits = VecTraits<int16_t>; | ||
203 | using MapVectorType = typename MapVecTraits::VectorType; | ||
204 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
205 | using FracVecTraits = VecTraits<uint16_t>; | ||
206 | using FracVectorType = typename FracVecTraits::VectorType; | ||
207 | |||
208 | 134 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
209 | size_t src_height, svuint16_t& v_src_stride, | ||
210 | MapVectorType& v_x_max, MapVectorType& v_y_max) | ||
211 | 134 | : src_rows_{src_rows}, | |
212 | 134 | v_src_element_stride_{v_src_stride}, | |
213 | 134 | v_xmax_{v_x_max}, | |
214 | 134 | v_ymax_{v_y_max} { | |
215 | 134 | v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); | |
216 | 134 | v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1)); | |
217 | 134 | v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1)); | |
218 | 134 | } | |
219 | |||
220 | 158 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
221 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
222 | 158 | svuint16_t src_a, src_b, src_c, src_d; | |
223 | |||
224 | 158 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
225 | 5132 | auto vector_path = [&](svbool_t pg, ptrdiff_t step) { | |
226 | 4974 | load_source(pg, step, mapxy, src_a, src_b, src_c, src_d); | |
227 | 9948 | interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d, | |
228 | 4974 | bias); | |
229 | 4974 | }; | |
230 | |||
231 | 158 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
232 | 5058 | loop.unroll_once([&](size_t step) { | |
233 | 4900 | svbool_t pg = MapVecTraits::svptrue(); | |
234 | 4900 | vector_path(pg, static_cast<ptrdiff_t>(step)); | |
235 | 4900 | }); | |
236 | 232 | loop.remaining([&](size_t length, size_t step) { | |
237 | 74 | svbool_t pg = MapVecTraits::svwhilelt(step - length, step); | |
238 | 74 | vector_path(pg, static_cast<ptrdiff_t>(length)); | |
239 | 74 | }); | |
240 | 158 | } | |
241 | |||
242 | protected: | ||
243 | 19896 | svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t, | |
244 | svuint32_t offsets_t) { | ||
245 | // Account for the size of the source type when calculating offset | ||
246 | 19896 | offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1); | |
247 | 19896 | offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1); | |
248 | |||
249 | 39792 | svuint32_t src_b = | |
250 | 19896 | svldnt1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
251 | 39792 | svuint32_t src_t = | |
252 | 19896 | svldnt1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
253 | 59688 | return svtrn1_u16(svreinterpret_u16_u32(src_b), | |
254 | 19896 | svreinterpret_u16_u32(src_t)); | |
255 | 19896 | } | |
256 | |||
257 | 4974 | void load_source(svbool_t pg, ptrdiff_t step, Columns<const int16_t>& mapxy, | |
258 | svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c, | ||
259 | svuint16_t& src_d) { | ||
260 | 4974 | MapVector2Type xy = svld2_s16(pg, &mapxy[0]); | |
261 | |||
262 | // Clamp coordinates to within the dimensions of the source image | ||
263 | 9948 | svuint16_t x0 = svreinterpret_u16_s16( | |
264 | 4974 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); | |
265 | 9948 | svuint16_t y0 = svreinterpret_u16_s16( | |
266 | 4974 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); | |
267 | |||
268 | // x1 = x0 + 1, and clamp it too | ||
269 | 9948 | svuint16_t x1 = svreinterpret_u16_s16( | |
270 | 9948 | svmax_x(pg, svdup_n_s16(0), | |
271 | 4974 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); | |
272 | |||
273 | 9948 | svuint16_t y1 = svreinterpret_u16_s16( | |
274 | 9948 | svmax_x(pg, svdup_n_s16(0), | |
275 | 4974 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); | |
276 | 4974 | svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); | |
277 | 4974 | svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); | |
278 | |||
279 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
280 | 9948 | svuint32_t offsets_a_b = | |
281 | 4974 | svmlalb_u32(svmovlb_u32(x0), y0, v_src_element_stride_); | |
282 | 9948 | svuint32_t offsets_a_t = | |
283 | 4974 | svmlalt_u32(svmovlt_u32(x0), y0, v_src_element_stride_); | |
284 | 9948 | svuint32_t offsets_b_b = | |
285 | 4974 | svmlalb_u32(svmovlb_u32(x1), y0, v_src_element_stride_); | |
286 | 9948 | svuint32_t offsets_b_t = | |
287 | 4974 | svmlalt_u32(svmovlt_u32(x1), y0, v_src_element_stride_); | |
288 | 9948 | svuint32_t offsets_c_b = | |
289 | 4974 | svmlalb_u32(svmovlb_u32(x0), y1, v_src_element_stride_); | |
290 | 9948 | svuint32_t offsets_c_t = | |
291 | 4974 | svmlalt_u32(svmovlt_u32(x0), y1, v_src_element_stride_); | |
292 | 9948 | svuint32_t offsets_d_b = | |
293 | 4974 | svmlalb_u32(svmovlb_u32(x1), y1, v_src_element_stride_); | |
294 | 9948 | svuint32_t offsets_d_t = | |
295 | 4974 | svmlalt_u32(svmovlt_u32(x1), y1, v_src_element_stride_); | |
296 | |||
297 | // Load pixels from source | ||
298 | 4974 | src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t); | |
299 | 4974 | src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t); | |
300 | 4974 | src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t); | |
301 | 4974 | src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t); | |
302 | 4974 | mapxy += step; | |
303 | 4974 | } | |
304 | |||
305 | 4974 | void interpolate_and_store(svbool_t pg, ptrdiff_t step, | |
306 | Columns<const uint16_t>& mapfrac, | ||
307 | Columns<ScalarType>& dst, svuint16_t src_a, | ||
308 | svuint16_t src_b, svuint16_t src_c, | ||
309 | svuint16_t src_d, svuint32_t bias) { | ||
310 | 4974 | FracVectorType frac = svld1_u16(pg, &mapfrac[0]); | |
311 | 9948 | svuint16_t result = interpolate_16point5<uint16_t>(pg, frac, src_a, src_b, | |
312 | 4974 | src_c, src_d, bias); | |
313 | 4974 | svst1_u16(pg, &dst[0], result); | |
314 | 4974 | mapfrac += step; | |
315 | 4974 | dst += step; | |
316 | 4974 | } | |
317 | |||
318 | Rows<const ScalarType> src_rows_; | ||
319 | |||
320 | private: | ||
321 | svuint16_t& v_src_element_stride_; | ||
322 | MapVectorType& v_xmax_; | ||
323 | MapVectorType& v_ymax_; | ||
324 | }; // end of class RemapS16Point5Replicate<uint16_t> | ||
325 | |||
// Remap row processor that substitutes a constant value for samples whose
// coordinates fall outside the source image.
// Declared only; defined through explicit specialisations per element type.
template <class ScalarType>
class RemapS16Point5ConstantBorder;
328 | |||
329 | template <> | ||
330 | class RemapS16Point5ConstantBorder<uint8_t> { | ||
331 | public: | ||
332 | using ScalarType = uint8_t; | ||
333 | |||
334 | 132 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
335 | size_t src_width, size_t src_height, | ||
336 | const ScalarType* border_value, | ||
337 | svuint16_t& v_src_stride, svuint16_t& v_width, | ||
338 | svuint16_t& v_height, svuint16_t& v_border) | ||
339 | 132 | : src_rows_{src_rows}, | |
340 | 132 | v_src_stride_{v_src_stride}, | |
341 | 132 | v_width_{v_width}, | |
342 | 132 | v_height_{v_height}, | |
343 | 132 | v_border_{v_border} { | |
344 | 132 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
345 | 132 | v_width_ = svdup_u16(static_cast<uint16_t>(src_width)); | |
346 | 132 | v_height_ = svdup_u16(static_cast<uint16_t>(src_height)); | |
347 | 132 | v_border_ = svdup_u16(*border_value); | |
348 | 132 | } | |
349 | |||
350 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
351 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
352 | 156 | svuint16_t one = svdup_n_u16(1); | |
353 | 156 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
354 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 5272 times.
|
5428 | for (size_t i = 0; i < width; i += svcnth()) { |
355 | 5272 | svbool_t pg = svwhilelt_b16_u64(i, width); | |
356 | |||
357 | 10544 | svuint16x2_t xy = | |
358 | 10544 | svld2_u16(pg, reinterpret_cast<const uint16_t*>( | |
359 | 5272 | &mapxy[static_cast<ptrdiff_t>(i * 2)])); | |
360 | |||
361 | 5272 | svuint16_t x0 = svget2(xy, 0); | |
362 | 5272 | svuint16_t y0 = svget2(xy, 1); | |
363 | 5272 | svuint16_t x1 = svadd_x(pg, x0, one); | |
364 | 5272 | svuint16_t y1 = svadd_x(pg, y0, one); | |
365 | |||
366 | 10544 | svuint16_t v00 = load_pixels_or_constant_border( | |
367 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y0); | |
368 | 10544 | svuint16_t v01 = load_pixels_or_constant_border( | |
369 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y1); | |
370 | 10544 | svuint16_t v10 = load_pixels_or_constant_border( | |
371 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y0); | |
372 | 10544 | svuint16_t v11 = load_pixels_or_constant_border( | |
373 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y1); | |
374 | |||
375 | 5272 | svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast<ptrdiff_t>(i)]); | |
376 | 10544 | svuint16_t result = | |
377 | 5272 | interpolate_16point5<uint8_t>(pg, frac, v00, v10, v01, v11, bias); | |
378 | |||
379 | 5272 | svst1b_u16(pg, &dst[static_cast<ptrdiff_t>(i)], result); | |
380 | 5272 | } | |
381 | 156 | } | |
382 | |||
383 | private: | ||
384 | 21088 | svuint16_t load_pixels_or_constant_border(Rows<const ScalarType> src_rows_, | |
385 | svuint16_t& v_src_stride_, | ||
386 | svuint16_t& v_width_, | ||
387 | svuint16_t& v_height_, | ||
388 | svuint16_t& v_border_, svbool_t pg, | ||
389 | svuint16_t x, svuint16_t y) { | ||
390 | // Find whether coordinates are within the image dimensions. | ||
391 | 42176 | svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), | |
392 | 21088 | svcmplt_u16(pg, y, v_height_)); | |
393 | |||
394 | // Calculate offsets from coordinates (y * stride + x) | ||
395 | 21088 | svuint32_t offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_stride_); | |
396 | 21088 | svuint32_t offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_stride_); | |
397 | |||
398 | 21088 | svbool_t pg_b = in_range; | |
399 | 21088 | svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); | |
400 | |||
401 | // Copy pixels from source | ||
402 | 42176 | svuint32_t result_b = | |
403 | 21088 | svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
404 | 42176 | svuint32_t result_t = | |
405 | 21088 | svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
406 | |||
407 | 42176 | svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), | |
408 | 21088 | svreinterpret_u16_u32(result_t)); | |
409 | |||
410 | 42176 | return svsel(in_range, result, v_border_); | |
411 | 21088 | } | |
412 | |||
413 | Rows<const ScalarType> src_rows_; | ||
414 | svuint16_t& v_src_stride_; | ||
415 | svuint16_t& v_width_; | ||
416 | svuint16_t& v_height_; | ||
417 | svuint16_t& v_border_; | ||
418 | }; // end of class RemapS16Point5ConstantBorder<uint8_t> | ||
419 | |||
420 | template <> | ||
421 | class RemapS16Point5ConstantBorder<uint16_t> { | ||
422 | public: | ||
423 | using ScalarType = uint16_t; | ||
424 | |||
425 | 132 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
426 | size_t src_width, size_t src_height, | ||
427 | const ScalarType* border_value, | ||
428 | svuint16_t& v_src_stride, svuint16_t& v_width, | ||
429 | svuint16_t& v_height, svuint16_t& v_border) | ||
430 | 132 | : src_rows_{src_rows}, | |
431 | 132 | v_src_element_stride_{v_src_stride}, | |
432 | 132 | v_width_{v_width}, | |
433 | 132 | v_height_{v_height}, | |
434 | 132 | v_border_{v_border} { | |
435 | 132 | v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); | |
436 | 132 | v_width_ = svdup_u16(static_cast<uint16_t>(src_width)); | |
437 | 132 | v_height_ = svdup_u16(static_cast<uint16_t>(src_height)); | |
438 | 132 | v_border_ = svdup_u16(*border_value); | |
439 | 132 | } | |
440 | |||
441 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
442 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
443 | 156 | svuint16_t one = svdup_n_u16(1); | |
444 | 156 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
445 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 4972 times.
|
5128 | for (size_t i = 0; i < width; i += svcnth()) { |
446 | 4972 | svbool_t pg = svwhilelt_b16_u64(i, width); | |
447 | |||
448 | 9944 | svuint16x2_t xy = | |
449 | 9944 | svld2_u16(pg, reinterpret_cast<const uint16_t*>( | |
450 | 4972 | &mapxy[static_cast<ptrdiff_t>(i * 2)])); | |
451 | |||
452 | 4972 | svuint16_t x0 = svget2(xy, 0); | |
453 | 4972 | svuint16_t y0 = svget2(xy, 1); | |
454 | 4972 | svuint16_t x1 = svadd_x(pg, x0, one); | |
455 | 4972 | svuint16_t y1 = svadd_x(pg, y0, one); | |
456 | |||
457 | 9944 | svuint16_t v00 = load_pixels_or_constant_border( | |
458 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
459 | 4972 | x0, y0); | |
460 | 9944 | svuint16_t v01 = load_pixels_or_constant_border( | |
461 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
462 | 4972 | x0, y1); | |
463 | 9944 | svuint16_t v10 = load_pixels_or_constant_border( | |
464 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
465 | 4972 | x1, y0); | |
466 | 9944 | svuint16_t v11 = load_pixels_or_constant_border( | |
467 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
468 | 4972 | x1, y1); | |
469 | |||
470 | 4972 | svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast<ptrdiff_t>(i)]); | |
471 | 9944 | svuint16_t result = | |
472 | 4972 | interpolate_16point5<uint16_t>(pg, frac, v00, v10, v01, v11, bias); | |
473 | |||
474 | 4972 | svst1_u16(pg, &dst[static_cast<ptrdiff_t>(i)], result); | |
475 | 4972 | } | |
476 | 156 | } | |
477 | |||
478 | private: | ||
479 | 19888 | svuint16_t load_pixels_or_constant_border(Rows<const ScalarType> src_rows_, | |
480 | svuint16_t& v_src_element_stride_, | ||
481 | svuint16_t& v_width_, | ||
482 | svuint16_t& v_height_, | ||
483 | svuint16_t& v_border_, svbool_t pg, | ||
484 | svuint16_t x, svuint16_t y) { | ||
485 | // Find whether coordinates are within the image dimensions. | ||
486 | 39776 | svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), | |
487 | 19888 | svcmplt_u16(pg, y, v_height_)); | |
488 | |||
489 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
490 | 39776 | svuint32_t offsets_b = | |
491 | 19888 | svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); | |
492 | 39776 | svuint32_t offsets_t = | |
493 | 19888 | svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); | |
494 | |||
495 | 19888 | svbool_t pg_b = in_range; | |
496 | 19888 | svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); | |
497 | |||
498 | // Account for the size of the source type when calculating offset | ||
499 | 19888 | offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1); | |
500 | 19888 | offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1); | |
501 | |||
502 | // Copy pixels from source | ||
503 | 39776 | svuint32_t result_b = | |
504 | 19888 | svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
505 | 39776 | svuint32_t result_t = | |
506 | 19888 | svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
507 | |||
508 | 39776 | svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), | |
509 | 19888 | svreinterpret_u16_u32(result_t)); | |
510 | |||
511 | 39776 | return svsel(in_range, result, v_border_); | |
512 | 19888 | } | |
513 | |||
514 | Rows<const ScalarType> src_rows_; | ||
515 | svuint16_t& v_src_element_stride_; | ||
516 | svuint16_t& v_width_; | ||
517 | svuint16_t& v_height_; | ||
518 | svuint16_t& v_border_; | ||
519 | }; // end of class RemapS16Point5ConstantBorder<uint16_t> | ||
520 | |||
// Four-channel counterpart of the clamping remap row processor.
// Declared only; defined through explicit specialisations per element type.
template <class ScalarType>
class RemapS16Point5Replicate4ch;
523 | |||
524 | template <> | ||
525 | class RemapS16Point5Replicate4ch<uint8_t> { | ||
526 | public: | ||
527 | using ScalarType = uint8_t; | ||
528 | using MapVecTraits = VecTraits<int16_t>; | ||
529 | using MapVectorType = typename MapVecTraits::VectorType; | ||
530 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
531 | using FracVecTraits = VecTraits<uint16_t>; | ||
532 | using FracVectorType = typename FracVecTraits::VectorType; | ||
533 | |||
534 | 132 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
535 | size_t src_height, svuint16_t& v_src_stride, | ||
536 | MapVectorType& v_x_max, MapVectorType& v_y_max) | ||
537 | 132 | : src_rows_{src_rows}, | |
538 | 132 | v_src_stride_{v_src_stride}, | |
539 | 132 | v_xmax_{v_x_max}, | |
540 | 132 | v_ymax_{v_y_max} { | |
541 | 132 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
542 | 132 | v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1)); | |
543 | 132 | v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1)); | |
544 | 132 | } | |
545 | |||
546 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
547 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
548 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
549 | 5354 | loop.unroll_once([&](size_t step) { | |
550 | 5198 | svbool_t pg = MapVecTraits::svptrue(); | |
551 | 5198 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step)); | |
552 | 5198 | }); | |
553 | 230 | loop.remaining([&](size_t length, size_t step) { | |
554 | 74 | svbool_t pg = MapVecTraits::svwhilelt(step - length, step); | |
555 | 74 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length)); | |
556 | 74 | }); | |
557 | 156 | } | |
558 | |||
559 | 5272 | void vector_path(svbool_t pg, Columns<const int16_t>& mapxy, | |
560 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
561 | ptrdiff_t step) { | ||
562 | 5272 | MapVector2Type xy = svld2_s16(pg, &mapxy[0]); | |
563 | 5272 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
564 | |||
565 | // Clamp coordinates to within the dimensions of the source image | ||
566 | 10544 | svuint16_t x0 = svreinterpret_u16_s16( | |
567 | 5272 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); | |
568 | 10544 | svuint16_t y0 = svreinterpret_u16_s16( | |
569 | 5272 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); | |
570 | |||
571 | // x1 = x0 + 1, and clamp it too | ||
572 | 10544 | svuint16_t x1 = svreinterpret_u16_s16( | |
573 | 10544 | svmax_x(pg, svdup_n_s16(0), | |
574 | 5272 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); | |
575 | |||
576 | 10544 | svuint16_t y1 = svreinterpret_u16_s16( | |
577 | 10544 | svmax_x(pg, svdup_n_s16(0), | |
578 | 5272 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); | |
579 | 5272 | svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); | |
580 | 5272 | svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); | |
581 | |||
582 | // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 | ||
583 | // channels | ||
584 | 26360 | auto load_4ch_b = [&](svuint16_t x, svuint16_t y) { | |
585 | 21088 | return svreinterpret_u8_u32(svld1_gather_u32offset_u32( | |
586 | 21088 | pg_b, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
587 | 21088 | svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_))); | |
588 | }; | ||
589 | 26360 | auto load_4ch_t = [&](svuint16_t x, svuint16_t y) { | |
590 | 21088 | return svreinterpret_u8_u32(svld1_gather_u32offset_u32( | |
591 | 21088 | pg_t, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
592 | 21088 | svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_))); | |
593 | }; | ||
594 | |||
595 | 5272 | FracVectorType frac = svld1_u16(pg, &mapfrac[0]); | |
596 | 10544 | svuint16_t xfrac = | |
597 | 5272 | svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
598 | 10544 | svuint16_t yfrac = | |
599 | 10544 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
600 | 5272 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
601 | |||
602 | 26360 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
603 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
604 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
605 | 42176 | svuint16_t line0 = svmla_x( | |
606 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); | |
607 | 42176 | svuint16_t line1 = svmla_x( | |
608 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); | |
609 | |||
610 | 21088 | svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); | |
611 | 21088 | svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); | |
612 | 21088 | acc_b = svmlalb_u32(acc_b, line1, yfrac); | |
613 | 21088 | acc_t = svmlalt_u32(acc_t, line1, yfrac); | |
614 | |||
615 | 42176 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
616 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
617 | 21088 | }; | |
618 | |||
619 | // bottom part | ||
620 | 5272 | svuint8_t a = load_4ch_b(x0, y0); | |
621 | 5272 | svuint8_t b = load_4ch_b(x1, y0); | |
622 | 5272 | svuint8_t c = load_4ch_b(x0, y1); | |
623 | 5272 | svuint8_t d = load_4ch_b(x1, y1); | |
624 | // from xfrac, we need the bottom part twice | ||
625 | 5272 | svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); | |
626 | 10544 | svuint16_t nxfrac2b = svsub_u16_x( | |
627 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); | |
628 | 5272 | svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); | |
629 | 10544 | svuint16_t nyfrac2b = svsub_u16_x( | |
630 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); | |
631 | |||
632 | // a,b,c,d looks like 12341234...(four channels) | ||
633 | // bottom is 1313... | ||
634 | 10544 | svuint16_t res_bb = | |
635 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), | |
636 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
637 | // top is 2424... | ||
638 | 10544 | svuint16_t res_bt = | |
639 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), | |
640 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
641 | 10544 | svuint8_t res_b = | |
642 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); | |
643 | |||
644 | // top part | ||
645 | 5272 | a = load_4ch_t(x0, y0); | |
646 | 5272 | b = load_4ch_t(x1, y0); | |
647 | 5272 | c = load_4ch_t(x0, y1); | |
648 | 5272 | d = load_4ch_t(x1, y1); | |
649 | // from xfrac, we need the top part twice | ||
650 | 5272 | svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); | |
651 | 10544 | svuint16_t nxfrac2t = svsub_u16_x( | |
652 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); | |
653 | 5272 | svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); | |
654 | 10544 | svuint16_t nyfrac2t = svsub_u16_x( | |
655 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); | |
656 | |||
657 | // a,b,c,d looks like 12341234...(four channels) | ||
658 | // bottom is 1313... | ||
659 | 10544 | svuint16_t res_tb = | |
660 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), | |
661 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
662 | // top is 2424... | ||
663 | 10544 | svuint16_t res_tt = | |
664 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), | |
665 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
666 | 10544 | svuint8_t res_t = | |
667 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); | |
668 | |||
669 | 5272 | svbool_t pg_low = svwhilelt_b32_u64(0L, static_cast<size_t>(step)); | |
670 | 5272 | svbool_t pg_high = svwhilelt_b32_u64(svcntw(), static_cast<size_t>(step)); | |
671 | 10544 | svuint32_t res_low = | |
672 | 5272 | svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
673 | 10544 | svuint32_t res_high = | |
674 | 5272 | svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
675 | 5272 | mapxy += step; | |
676 | 5272 | svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low); | |
677 | 10544 | svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(), | |
678 | 5272 | res_high); | |
679 | 5272 | mapfrac += step; | |
680 | 5272 | dst += step; | |
681 | 5272 | } | |
682 | |||
683 | Rows<const ScalarType> src_rows_; | ||
684 | |||
685 | private: | ||
686 | svuint16_t& v_src_stride_; | ||
687 | MapVectorType& v_xmax_; | ||
688 | MapVectorType& v_ymax_; | ||
689 | }; // end of class RemapS16Point5Replicate4ch<uint8_t> | ||
690 | |||
// Four-channel 16.5 fixed-point remap for uint16_t pixels in which the sample
// coordinates are clamped to the source image before loading (see the
// svmax_x / svmin_x clamps in vector_path). One pixel is 4 x 16 bits = 64
// bits, so pixels are fetched with 64-bit gathers and processed in two groups:
// "bottom" (even map entries) and "top" (odd map entries).
691 | template <> | ||
692 | class RemapS16Point5Replicate4ch<uint16_t> { | ||
693 | public: | ||
694 | using ScalarType = uint16_t; | ||
695 | |||
// Keeps a copy of the source rows and binds references to caller-provided
// vectors, which are then overwritten with the broadcast row stride,
// src_width - 1 and src_height - 1.
// NOTE(review): svdup_u32 narrows the stride to 32 bits and the maxima are
// cast to int32_t - presumably the caller has already validated the image
// dimensions; confirm.
696 | 132 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
697 | size_t src_height, svuint32_t& v_src_stride, | ||
698 | svint32_t& v_x_max, svint32_t& v_y_max) | ||
699 | 132 | : src_rows_{src_rows}, | |
700 | 132 | v_src_stride_{v_src_stride}, | |
701 | 132 | v_xmax_{v_x_max}, | |
702 | 132 | v_ymax_{v_y_max} { | |
703 | 132 | v_src_stride_ = svdup_u32(src_rows.stride()); | |
704 | 132 | v_xmax_ = svdup_s32(static_cast<int32_t>(src_width - 1)); | |
705 | 132 | v_ymax_ = svdup_s32(static_cast<int32_t>(src_height - 1)); | |
706 | 132 | } | |
707 | |||
// Remaps one destination row of `width` pixels, svcntw() map entries per
// vector_path call.
// NOTE(review): presumably LoopUnroll invokes unroll_once for every full
// vector and remaining once for the tail - confirm against LoopUnroll.
708 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
709 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
710 | 156 | LoopUnroll loop{width, svcntw()}; | |
// Full vector: every predicate is all-true.
711 | 10006 | loop.unroll_once([&](size_t step) { | |
712 | 19700 | vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), svptrue_b64(), | |
713 | 9850 | svptrue_b64(), mapxy, mapfrac, dst, | |
714 | 9850 | static_cast<ptrdiff_t>(step)); | |
715 | 9850 | }); | |
716 | 216 | loop.remaining([&](size_t length, size_t step) { | |
// pg: the first `length` 32-bit lanes are active (one lane per map entry).
717 | 60 | svbool_t pg = svwhilelt_b32_u64(step, step + length); | |
// pg64_b / pg64_t: predicate of the even / odd 32-bit lanes of pg, padded
// with false so that it can govern 64-bit lanes (used for the gathers).
718 | 60 | svbool_t pg64_b = svtrn1_b32(pg, svpfalse()); | |
719 | 60 | svbool_t pg64_t = svtrn2_b32(pg, svpfalse()); | |
// pg_low / pg_high: lower / upper half of pg spread over 64-bit lanes
// (used for the two 64-bit stores).
720 | 60 | svbool_t pg_low = svzip1_b32(pg, svpfalse()); | |
721 | 60 | svbool_t pg_high = svzip2_b32(pg, svpfalse()); | |
722 | 120 | vector_path(pg, pg64_b, pg64_t, pg_low, pg_high, mapxy, mapfrac, dst, | |
723 | 60 | static_cast<ptrdiff_t>(length)); | |
724 | 60 | }); | |
725 | 156 | } | |
726 | |||
// Processes up to one vector of map entries and then advances mapxy, mapfrac
// and dst by `step`.
//   pg               governs 32-bit lanes, one per map entry
//   pg64_b / pg64_t  govern the 64-bit gathers of the even / odd entries
//   pg_low / pg_high govern the two 64-bit stores of the result
727 | 9910 | void vector_path(svbool_t pg, svbool_t pg64_b, svbool_t pg64_t, | |
728 | svbool_t pg_low, svbool_t pg_high, | ||
729 | Columns<const int16_t>& mapxy, | ||
730 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
731 | ptrdiff_t step) { | ||
732 | // Load one vector of xy: even coordinates are x, odd are y | ||
733 | 19820 | svint16_t xy = svreinterpret_s16_u32( | |
734 | 9910 | svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0]))); | |
// Sign-extend the even (x) and odd (y) 16-bit lanes to 32 bits.
735 | 9910 | svint32_t x = svmovlb(xy); | |
736 | 9910 | svint32_t y = svmovlt(xy); | |
737 | // Clamp coordinates to within the dimensions of the source image | ||
738 | 19820 | svuint32_t x0 = svreinterpret_u32_s32( | |
739 | 9910 | svmax_x(pg, svdup_n_s32(0), svmin_x(pg, x, v_xmax_))); | |
740 | 19820 | svuint32_t y0 = svreinterpret_u32_s32( | |
741 | 9910 | svmax_x(pg, svdup_n_s32(0), svmin_x(pg, y, v_ymax_))); | |
742 | |||
743 | // x1 = x0 + 1, and clamp it too | ||
744 | 19820 | svuint32_t x1 = svreinterpret_u32_s32(svmax_x( | |
745 | 9910 | pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, x, 1), v_xmax_))); | |
746 | 19820 | svuint32_t y1 = svreinterpret_u32_s32(svmax_x( | |
747 | 9910 | pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, y, 1), v_ymax_))); | |
748 | |||
// Gathers one 64-bit pixel (4 x uint16_t) per active 64-bit lane from the
// start of src_rows_ plus the given byte offsets.
749 | 89190 | auto load_4ch = [&](svbool_t pg, svuint64_t offsets) { | |
750 | 79280 | return svreinterpret_u16_u64(svld1_gather_u64offset_u64( | |
751 | 79280 | pg, reinterpret_cast<const uint64_t*>(&src_rows_[0]), offsets)); | |
752 | }; | ||
753 | |||
754 | 9910 | svuint16_t xfrac, yfrac, nxfrac, nyfrac; | |
755 | { | ||
756 | // Fractions are loaded into even lanes | ||
757 | 9910 | svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0])); | |
758 | |||
759 | // Fractions are doubled, 00112233... (will be doubled again later) | ||
760 | 9910 | svuint16_t frac = svtrn1(rawfrac, rawfrac); | |
761 | |||
// Split the packed fraction: low bits are the x fraction, the bits above
// REMAP16POINT5_FRAC_BITS are the y fraction; n*frac = FRAC_MAX - *frac.
// NOTE(review): pg is a 32-bit-lane predicate, but these are 16-bit-lane
// "_x" operations, so the odd 16-bit lanes are formally inactive
// (unspecified) even though the top/bottom code below consumes them -
// confirm this reliance on unpredicated code generation is intended.
762 | 9910 | xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
763 | 19820 | yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
764 | 9910 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
765 | 9910 | nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
766 | 9910 | nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
767 | 9910 | } | |
768 | |||
// Added before the final right shift in lerp2d.
// NOTE(review): presumably FRAC_MAX_SQUARE == 1 << (2 * FRAC_BITS), which
// would make this a round-to-nearest bias - confirm.
769 | 9910 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
770 | |||
// Bilinear blend of four neighbouring samples:
//   line0 = xfrac * src_b + nxfrac * src_a   (widened to 32 bits)
//   line1 = xfrac * src_d + nxfrac * src_c   (widened to 32 bits)
//   result = (bias + line0 * nyfrac + line1 * yfrac) >> (2 * FRAC_BITS)
// Even and odd 16-bit lanes are handled separately (the b/t variants) and
// re-interleaved by the narrowing shifts svshrnb / svshrnt.
771 | 29730 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
772 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
773 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
774 | 19820 | svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); | |
775 | 19820 | svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); | |
776 | 19820 | svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); | |
777 | 19820 | svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); | |
778 | |||
779 | 39640 | svuint32_t acc_b = | |
780 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); | |
781 | 39640 | svuint32_t acc_t = | |
782 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); | |
783 | 19820 | acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); | |
784 | 19820 | acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); | |
785 | |||
786 | 39640 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
787 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
788 | 19820 | }; | |
789 | |||
790 | // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit) | ||
791 | // Calculation is done in 2 parts, top and bottom | ||
792 | 9910 | svuint16_t res_b, res_t; | |
793 | |||
794 | { // bottom | ||
// 64-bit byte offsets from the even 32-bit lanes:
// x << 3 (4 channels x 2 bytes per pixel) plus y * v_src_stride_.
795 | 9910 | svuint64_t x0w = svshllb_n_u64(x0, 3); | |
796 | 9910 | svuint64_t x1w = svshllb_n_u64(x1, 3); | |
797 | 9910 | svuint64_t ys0w = svmullb_u64(y0, v_src_stride_); | |
798 | 9910 | svuint64_t ys1w = svmullb_u64(y1, v_src_stride_); | |
799 | 9910 | svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w); | |
800 | 9910 | svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w); | |
801 | 9910 | svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w); | |
802 | 9910 | svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w); | |
803 | |||
// a = (x0, y0), b = (x1, y0), c = (x0, y1), d = (x1, y1)
804 | 9910 | svuint16_t a = load_4ch(pg64_b, offsets_a); | |
805 | 9910 | svuint16_t b = load_4ch(pg64_b, offsets_b); | |
806 | 9910 | svuint16_t c = load_4ch(pg64_b, offsets_c); | |
807 | 9910 | svuint16_t d = load_4ch(pg64_b, offsets_d); | |
808 | |||
809 | // Copy even lanes twice -> 000022224444... these are the "bottom" | ||
810 | // fractions | ||
811 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32( | |
812 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
813 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32( | |
814 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
815 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32( | |
816 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
817 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32( | |
818 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
819 | |||
820 | 9910 | res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
821 | 9910 | } | |
822 | |||
823 | { // top | ||
// Same as the bottom part, but built from the odd 32-bit lanes.
// NOTE(review): the offset additions below are predicated with pg64_b while
// the gathers use pg64_t - looks harmless as long as every lane active in
// pg64_t is also active in pg64_b, but confirm pg64_t was not intended.
824 | 9910 | svuint64_t x0w = svshllt_n_u64(x0, 3); | |
825 | 9910 | svuint64_t x1w = svshllt_n_u64(x1, 3); | |
826 | 9910 | svuint64_t ys0w = svmullt_u64(y0, v_src_stride_); | |
827 | 9910 | svuint64_t ys1w = svmullt_u64(y1, v_src_stride_); | |
828 | 9910 | svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w); | |
829 | 9910 | svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w); | |
830 | 9910 | svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w); | |
831 | 9910 | svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w); | |
832 | |||
833 | 9910 | svuint16_t a = load_4ch(pg64_t, offsets_a); | |
834 | 9910 | svuint16_t b = load_4ch(pg64_t, offsets_b); | |
835 | 9910 | svuint16_t c = load_4ch(pg64_t, offsets_c); | |
836 | 9910 | svuint16_t d = load_4ch(pg64_t, offsets_d); | |
837 | |||
838 | // Copy odd lanes twice -> 111133335555... these are the "top" | ||
839 | // fractions | ||
840 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32( | |
841 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
842 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32( | |
843 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
844 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32( | |
845 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
846 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32( | |
847 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
848 | |||
849 | 9910 | res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
850 | 9910 | } | |
851 | |||
// Interleave the 64-bit pixels of the bottom (even entries) and top (odd
// entries) results to restore map order, then store the two halves: the
// first at dst[0], the second svcntd() 64-bit pixels further on.
852 | 19820 | svuint64_t res_low = | |
853 | 9910 | svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
854 | 19820 | svuint64_t res_high = | |
855 | 9910 | svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
856 | 9910 | svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low); | |
857 | 19820 | svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(), | |
858 | 9910 | res_high); | |
// Advance all three cursors past the entries just processed.
859 | 9910 | mapxy += step; | |
860 | 9910 | mapfrac += step; | |
861 | 9910 | dst += step; | |
862 | 9910 | } | |
863 | |||
// Source image rows; gathers are addressed relative to &src_rows_[0].
864 | Rows<const ScalarType> src_rows_; | ||
865 | |||
866 | private: | ||
// References to caller-provided vectors that the constructor fills in:
// broadcast row stride, src_width - 1 and src_height - 1.
867 | svuint32_t& v_src_stride_; | ||
868 | svint32_t& v_xmax_; | ||
869 | svint32_t& v_ymax_; | ||
870 | }; // end of class RemapS16Point5Replicate4ch<uint16_t> | ||
871 | |||
// Primary template is only declared here; the uint8_t and uint16_t
// specializations that follow provide the implementations.
872 | template <typename ScalarType> | ||
873 | class RemapS16Point5Constant4ch; | ||
874 | |||
// Four-channel 16.5 fixed-point remap for uint8_t pixels in which samples
// whose coordinates fall outside the source image are replaced by a constant
// border pixel. One pixel is 4 x 8 bits = 32 bits, so pixels are fetched with
// 32-bit gathers and processed in two groups: "bottom" (even map entries) and
// "top" (odd map entries).
875 | template <> | ||
876 | class RemapS16Point5Constant4ch<uint8_t> { | ||
877 | public: | ||
878 | using ScalarType = uint8_t; | ||
879 | |||
// Keeps a copy of the source rows and binds references to caller-provided
// vectors, which are then overwritten with the broadcast row stride,
// src_width - 1, src_height - 1 and the border pixel. The border pixel is
// read as sizeof(uint32_t) bytes from border_value, so border_value must
// point to four uint8_t channel values.
// NOTE(review): stride and maxima are narrowed to 16 bits here - presumably
// the caller only selects this class for images that fit; confirm.
// NOTE(review): memcpy is used, but <cstring> is not among the includes
// visible at the top of the file - presumably included transitively; confirm.
880 | 132 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
881 | size_t src_height, const ScalarType* border_value, | ||
882 | svuint16_t& v_src_stride, svuint16_t& v_x_max, | ||
883 | svuint16_t& v_y_max, svuint32_t& v_border) | ||
884 | 132 | : src_rows_{src_rows}, | |
885 | 132 | v_src_stride_{v_src_stride}, | |
886 | 132 | v_xmax_{v_x_max}, | |
887 | 132 | v_ymax_{v_y_max}, | |
888 | 132 | v_border_{v_border} { | |
889 | 132 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
890 | 132 | v_xmax_ = svdup_u16(static_cast<uint16_t>(src_width - 1)); | |
891 | 132 | v_ymax_ = svdup_u16(static_cast<uint16_t>(src_height - 1)); | |
892 | 132 | uint32_t border_value_u32{}; | |
893 | 132 | memcpy(&border_value_u32, border_value, sizeof(uint32_t)); | |
894 | 132 | v_border_ = svdup_u32(border_value_u32); | |
895 | 132 | } | |
896 | |||
// Remaps one destination row of `width` pixels, svcnth() map entries per
// vector_path call.
// NOTE(review): presumably LoopUnroll invokes unroll_once for every full
// vector and remaining once for the tail - confirm against LoopUnroll.
897 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
898 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
899 | 156 | LoopUnroll loop{width, svcnth()}; | |
900 | 5354 | loop.unroll_once([&](size_t step) { | |
901 | 5198 | svbool_t pg = svptrue_b16(); | |
902 | 5198 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step)); | |
903 | 5198 | }); | |
904 | 230 | loop.remaining([&](size_t length, size_t step) { | |
// The first `length` 16-bit lanes are active (one lane per map entry).
905 | 74 | svbool_t pg = svwhilelt_b16_u64(step - length, step); | |
906 | 74 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length)); | |
907 | 74 | }); | |
908 | 156 | } | |
909 | |||
// Processes up to one vector of map entries (pg governs 16-bit lanes, one
// per entry), stores `step` 32-bit pixels and advances mapxy, mapfrac and
// dst by `step`.
910 | 5272 | void vector_path(svbool_t pg, Columns<const int16_t>& mapxy, | |
911 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
912 | ptrdiff_t step) { | ||
// De-interleaving load: element 0 of the pair holds x, element 1 holds y.
913 | 10544 | svuint16x2_t xy = | |
914 | 5272 | svld2_u16(pg, reinterpret_cast<const uint16_t*>(&mapxy[0])); | |
// Added before the final right shift in lerp2d.
// NOTE(review): presumably FRAC_MAX_SQUARE == 1 << (2 * FRAC_BITS), which
// would make this a round-to-nearest bias - confirm.
915 | 5272 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
916 | |||
917 | // Negative values become big positive ones | ||
918 | 5272 | svuint16_t x0 = svget2(xy, 0); | |
919 | 5272 | svuint16_t y0 = svget2(xy, 1); | |
// Unsigned add wraps, so a coordinate of -1 (0xFFFF) yields a neighbour at 0.
920 | 5272 | svuint16_t x1 = svadd_n_u16_x(pg, x0, 1); | |
921 | 5272 | svuint16_t y1 = svadd_n_u16_x(pg, y0, 1); | |
922 | |||
923 | // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 | ||
924 | // channels | ||
// Bottom variant: works on the even 16-bit lanes. Lanes whose x or y exceeds
// the maximum (this includes negative coordinates, see above) are not loaded
// and receive the border pixel instead.
925 | 26360 | auto load_4ch_or_border_b = [&](svuint16_t x, svuint16_t y) { | |
926 | 42176 | svbool_t in_range_b16 = | |
927 | 21088 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
// Keep the even 16-bit lanes so the predicate can govern 32-bit lanes.
928 | 21088 | svbool_t in_range = svtrn1_b16(in_range_b16, svpfalse()); | |
929 | 42176 | svuint32_t image = svld1_gather_u32offset_u32( | |
930 | 21088 | in_range, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
931 | 21088 | svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_)); | |
932 | 42176 | return svreinterpret_u8_u32(svsel(in_range, image, v_border_)); | |
933 | 21088 | }; | |
// Top variant: identical, but works on the odd 16-bit lanes.
934 | 26360 | auto load_4ch_or_border_t = [&](svuint16_t x, svuint16_t y) { | |
935 | 42176 | svbool_t in_range_b16 = | |
936 | 21088 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
937 | 21088 | svbool_t in_range = svtrn2_b16(in_range_b16, svpfalse()); | |
938 | 42176 | svuint32_t image = svld1_gather_u32offset_u32( | |
939 | 21088 | in_range, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
940 | 21088 | svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_)); | |
941 | 42176 | return svreinterpret_u8_u32(svsel(in_range, image, v_border_)); | |
942 | 21088 | }; | |
943 | |||
// Split the packed fraction: low bits are the x fraction, the bits above
// REMAP16POINT5_FRAC_BITS are the y fraction.
944 | 5272 | svuint16_t frac = svld1_u16(pg, &mapfrac[0]); | |
945 | 10544 | svuint16_t xfrac = | |
946 | 5272 | svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
947 | 10544 | svuint16_t yfrac = | |
948 | 10544 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
949 | 5272 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
950 | |||
// Bilinear blend of four neighbouring samples (8-bit values widened to 16):
//   line0 = xfrac * src_b + nxfrac * src_a   (kept in 16 bits)
//   line1 = xfrac * src_d + nxfrac * src_c   (kept in 16 bits)
//   result = (bias + line0 * nyfrac + line1 * yfrac) >> (2 * FRAC_BITS)
// NOTE(review): the 16-bit line sums presumably cannot overflow for 8-bit
// sources; that depends on the value of REMAP16POINT5_FRAC_MAX - confirm.
951 | 26360 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
952 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
953 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
954 | 42176 | svuint16_t line0 = svmla_x( | |
955 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); | |
956 | 42176 | svuint16_t line1 = svmla_x( | |
957 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); | |
958 | |||
959 | 21088 | svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); | |
960 | 21088 | svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); | |
961 | 21088 | acc_b = svmlalb_u32(acc_b, line1, yfrac); | |
962 | 21088 | acc_t = svmlalt_u32(acc_t, line1, yfrac); | |
963 | |||
964 | 42176 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
965 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
966 | 21088 | }; | |
967 | |||
968 | // bottom part | ||
// a = (x0, y0), b = (x1, y0), c = (x0, y1), d = (x1, y1)
969 | 5272 | svuint8_t a = load_4ch_or_border_b(x0, y0); | |
970 | 5272 | svuint8_t b = load_4ch_or_border_b(x1, y0); | |
971 | 5272 | svuint8_t c = load_4ch_or_border_b(x0, y1); | |
972 | 5272 | svuint8_t d = load_4ch_or_border_b(x1, y1); | |
973 | // from xfrac, we need the bottom part twice | ||
974 | 5272 | svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); | |
975 | 10544 | svuint16_t nxfrac2b = svsub_u16_x( | |
976 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); | |
977 | 5272 | svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); | |
978 | 10544 | svuint16_t nyfrac2b = svsub_u16_x( | |
979 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); | |
980 | |||
981 | // a,b,c,d looks like 12341234...(four channels) | ||
982 | // bottom is 1313... | ||
983 | 10544 | svuint16_t res_bb = | |
984 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), | |
985 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
986 | // top is 2424... | ||
987 | 10544 | svuint16_t res_bt = | |
988 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), | |
989 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
// Take the low byte of every 16-bit result and interleave 1313 with 2424
// to get the channels back into 1234 order.
990 | 10544 | svuint8_t res_b = | |
991 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); | |
992 | |||
993 | // top part | ||
994 | 5272 | a = load_4ch_or_border_t(x0, y0); | |
995 | 5272 | b = load_4ch_or_border_t(x1, y0); | |
996 | 5272 | c = load_4ch_or_border_t(x0, y1); | |
997 | 5272 | d = load_4ch_or_border_t(x1, y1); | |
998 | // from xfrac, we need the top part twice | ||
999 | 5272 | svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); | |
1000 | 10544 | svuint16_t nxfrac2t = svsub_u16_x( | |
1001 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); | |
1002 | 5272 | svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); | |
1003 | 10544 | svuint16_t nyfrac2t = svsub_u16_x( | |
1004 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); | |
1005 | |||
1006 | // a,b,c,d looks like 12341234...(four channels) | ||
1007 | // bottom is 1313... | ||
1008 | 10544 | svuint16_t res_tb = | |
1009 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), | |
1010 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
1011 | // top is 2424... | ||
1012 | 10544 | svuint16_t res_tt = | |
1013 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), | |
1014 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
1015 | 10544 | svuint8_t res_t = | |
1016 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); | |
1017 | |||
// Store predicates over 32-bit pixels: pg_low covers pixels [0, step) of the
// first vector, pg_high covers pixels [svcntw(), step) of the second one.
1018 | 5272 | svbool_t pg_low = svwhilelt_b32_u64(0L, static_cast<size_t>(step)); | |
1019 | 5272 | svbool_t pg_high = svwhilelt_b32_u64(svcntw(), static_cast<size_t>(step)); | |
// Interleave the 32-bit pixels of the bottom (even entries) and top (odd
// entries) results to restore map order.
1020 | 10544 | svuint32_t res_low = | |
1021 | 5272 | svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
1022 | 10544 | svuint32_t res_high = | |
1023 | 5272 | svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
1024 | 5272 | mapxy += step; | |
1025 | 5272 | svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low); | |
1026 | 10544 | svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(), | |
1027 | 5272 | res_high); | |
1028 | 5272 | mapfrac += step; | |
1029 | 5272 | dst += step; | |
1030 | 5272 | } | |
1031 | |||
// Source image rows; gathers are addressed relative to &src_rows_[0].
1032 | Rows<const ScalarType> src_rows_; | ||
1033 | |||
1034 | private: | ||
// References to caller-provided vectors that the constructor fills in:
// broadcast row stride, src_width - 1, src_height - 1 and the border pixel.
1035 | svuint16_t& v_src_stride_; | ||
1036 | svuint16_t& v_xmax_; | ||
1037 | svuint16_t& v_ymax_; | ||
1038 | svuint32_t& v_border_; | ||
1039 | }; // end of class RemapS16Point5Constant4ch<uint8_t> | ||
1040 | |||
// Four-channel 16.5 fixed-point remap for uint16_t pixels in which samples
// whose coordinates fall outside the source image are replaced by a constant
// border pixel. One pixel is 4 x 16 bits = 64 bits, so pixels are fetched with
// 64-bit gathers and processed in two groups: "bottom" (even map entries) and
// "top" (odd map entries).
1041 | template <> | ||
1042 | class RemapS16Point5Constant4ch<uint16_t> { | ||
1043 | public: | ||
1044 | using ScalarType = uint16_t; | ||
1045 | |||
// Keeps a copy of the source rows and binds references to caller-provided
// vectors, which are then overwritten with the broadcast row stride,
// src_width - 1, src_height - 1 and the border pixel. The border pixel is
// read as sizeof(uint64_t) bytes from border_value, so border_value must
// point to four uint16_t channel values.
// NOTE(review): stride and maxima are narrowed to 32 bits here - presumably
// the caller has already validated the image dimensions; confirm.
// NOTE(review): memcpy is used, but <cstring> is not among the includes
// visible at the top of the file - presumably included transitively; confirm.
1046 | 132 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
1047 | size_t src_height, const ScalarType* border_value, | ||
1048 | svuint32_t& v_src_stride, svuint32_t& v_x_max, | ||
1049 | svuint32_t& v_y_max, svuint64_t& v_border) | ||
1050 | 132 | : src_rows_{src_rows}, | |
1051 | 132 | v_src_stride_{v_src_stride}, | |
1052 | 132 | v_xmax_{v_x_max}, | |
1053 | 132 | v_ymax_{v_y_max}, | |
1054 | 132 | v_border_{v_border} { | |
1055 | 132 | v_src_stride_ = svdup_u32(src_rows.stride()); | |
1056 | 132 | v_xmax_ = svdup_u32(static_cast<uint32_t>(src_width - 1)); | |
1057 | 132 | v_ymax_ = svdup_u32(static_cast<uint32_t>(src_height - 1)); | |
1058 | 132 | uint64_t border_value_u64{}; | |
1059 | 132 | memcpy(&border_value_u64, border_value, sizeof(uint64_t)); | |
1060 | 132 | v_border_ = svdup_u64(border_value_u64); | |
1061 | 132 | } | |
1062 | |||
// Remaps one destination row of `width` pixels, svcntw() map entries per
// vector_path call.
// NOTE(review): presumably LoopUnroll invokes unroll_once for every full
// vector and remaining once for the tail - confirm against LoopUnroll.
1063 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
1064 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
1065 | 156 | LoopUnroll loop{width, svcntw()}; | |
// Full vector: every predicate is all-true.
1066 | 10006 | loop.unroll_once([&](size_t step) { | |
1067 | 19700 | vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), mapxy, mapfrac, | |
1068 | 9850 | dst, static_cast<ptrdiff_t>(step)); | |
1069 | 9850 | }); | |
1070 | 216 | loop.remaining([&](size_t length, size_t step) { | |
// pg: the first `length` 32-bit lanes are active (one lane per map entry).
1071 | 60 | svbool_t pg = svwhilelt_b32_u64(step, step + length); | |
// pg_low / pg_high: lower / upper half of pg spread over 64-bit lanes
// (used for the two 64-bit stores).
1072 | 60 | svbool_t pg_low = svzip1_b32(pg, svpfalse()); | |
1073 | 60 | svbool_t pg_high = svzip2_b32(pg, svpfalse()); | |
1074 | 120 | vector_path(pg, pg_low, pg_high, mapxy, mapfrac, dst, | |
1075 | 60 | static_cast<ptrdiff_t>(length)); | |
1076 | 60 | }); | |
1077 | 156 | } | |
1078 | |||
// Processes up to one vector of map entries and then advances mapxy, mapfrac
// and dst by `step`.
//   pg               governs 32-bit lanes, one per map entry
//   pg_low / pg_high govern the two 64-bit stores of the result
1079 | 9910 | void vector_path(svbool_t pg, svbool_t pg_low, svbool_t pg_high, | |
1080 | Columns<const int16_t>& mapxy, | ||
1081 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
1082 | ptrdiff_t step) { | ||
1083 | // Load one vector of xy: even coordinates are x, odd are y | ||
1084 | 19820 | svint16_t xy = svreinterpret_s16_u32( | |
1085 | 9910 | svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0]))); | |
1086 | |||
1087 | // Negative values become big positive ones | ||
1088 | // Widening is signed, so 16-bit -1 becomes 32-bit -1 | ||
1089 | 9910 | svuint32_t x0 = svreinterpret_u32_s32(svmovlb(xy)); | |
1090 | 9910 | svuint32_t y0 = svreinterpret_u32_s32(svmovlt(xy)); | |
// Unsigned add wraps, so a coordinate of -1 yields a neighbour at 0.
1091 | 9910 | svuint32_t x1 = svadd_n_u32_x(pg, x0, 1); | |
1092 | 9910 | svuint32_t y1 = svadd_n_u32_x(pg, y0, 1); | |
1093 | |||
// Bottom variant: works on the even 32-bit lanes. Lanes whose x or y exceeds
// the maximum (this includes negative coordinates, see above) are not loaded
// and receive the border pixel instead. The byte offset is
// x << 3 (4 channels x 2 bytes per pixel) plus y * v_src_stride_.
1094 | 49550 | auto load_4ch_or_border_b = [&](svuint32_t x, svuint32_t y) { | |
1095 | 79280 | svbool_t in_range_b32 = | |
1096 | 39640 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
// Keep the even 32-bit lanes so the predicate can govern 64-bit lanes.
1097 | 39640 | svbool_t in_range = svtrn1_b32(in_range_b32, svpfalse()); | |
1098 | 79280 | svuint64_t image = svld1_gather_u64offset_u64( | |
1099 | 39640 | in_range, reinterpret_cast<const uint64_t*>(&src_rows_[0]), | |
1100 | 39640 | svmlalb_u64(svshllb_n_u64(x, 3), y, v_src_stride_)); | |
1101 | 79280 | return svreinterpret_u16_u64(svsel(in_range, image, v_border_)); | |
1102 | 39640 | }; | |
1103 | |||
// Top variant: identical, but works on the odd 32-bit lanes.
1104 | 49550 | auto load_4ch_or_border_t = [&](svuint32_t x, svuint32_t y) { | |
1105 | 79280 | svbool_t in_range_b32 = | |
1106 | 39640 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
1107 | 39640 | svbool_t in_range = svtrn2_b32(in_range_b32, svpfalse()); | |
1108 | 79280 | svuint64_t image = svld1_gather_u64offset_u64( | |
1109 | 39640 | in_range, reinterpret_cast<const uint64_t*>(&src_rows_[0]), | |
1110 | 39640 | svmlalt_u64(svshllt_n_u64(x, 3), y, v_src_stride_)); | |
1111 | 79280 | return svreinterpret_u16_u64(svsel(in_range, image, v_border_)); | |
1112 | 39640 | }; | |
1113 | |||
1114 | 9910 | svuint16_t xfrac, yfrac, nxfrac, nyfrac; | |
1115 | { | ||
1116 | // Fractions are loaded into even lanes | ||
1117 | 9910 | svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0])); | |
1118 | |||
1119 | // Fractions are doubled, 00112233... (will be doubled again later) | ||
1120 | 9910 | svuint16_t frac = svtrn1(rawfrac, rawfrac); | |
1121 | |||
// Split the packed fraction: low bits are the x fraction, the bits above
// REMAP16POINT5_FRAC_BITS are the y fraction; n*frac = FRAC_MAX - *frac.
// NOTE(review): pg is a 32-bit-lane predicate, but these are 16-bit-lane
// "_x" operations, so the odd 16-bit lanes are formally inactive
// (unspecified) even though the top/bottom code below consumes them -
// confirm this reliance on unpredicated code generation is intended.
1122 | 9910 | xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
1123 | 19820 | yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
1124 | 9910 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
1125 | 9910 | nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
1126 | 9910 | nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
1127 | 9910 | } | |
1128 | |||
// Added before the final right shift in lerp2d.
// NOTE(review): presumably FRAC_MAX_SQUARE == 1 << (2 * FRAC_BITS), which
// would make this a round-to-nearest bias - confirm.
1129 | 9910 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
1130 | |||
// Bilinear blend of four neighbouring samples:
//   line0 = xfrac * src_b + nxfrac * src_a   (widened to 32 bits)
//   line1 = xfrac * src_d + nxfrac * src_c   (widened to 32 bits)
//   result = (bias + line0 * nyfrac + line1 * yfrac) >> (2 * FRAC_BITS)
// Even and odd 16-bit lanes are handled separately (the b/t variants) and
// re-interleaved by the narrowing shifts svshrnb / svshrnt.
1131 | 29730 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
1132 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
1133 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
1134 | 19820 | svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); | |
1135 | 19820 | svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); | |
1136 | 19820 | svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); | |
1137 | 19820 | svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); | |
1138 | |||
1139 | 39640 | svuint32_t acc_b = | |
1140 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); | |
1141 | 39640 | svuint32_t acc_t = | |
1142 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); | |
1143 | 19820 | acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); | |
1144 | 19820 | acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); | |
1145 | |||
1146 | 39640 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
1147 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
1148 | 19820 | }; | |
1149 | |||
1150 | // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit) | ||
1151 | // Calculation is done in 2 parts, top and bottom | ||
1152 | 9910 | svuint16_t res_b, res_t; | |
1153 | |||
1154 | { // bottom | ||
// a = (x0, y0), b = (x1, y0), c = (x0, y1), d = (x1, y1)
1155 | 9910 | svuint16_t a = load_4ch_or_border_b(x0, y0); | |
1156 | 9910 | svuint16_t b = load_4ch_or_border_b(x1, y0); | |
1157 | 9910 | svuint16_t c = load_4ch_or_border_b(x0, y1); | |
1158 | 9910 | svuint16_t d = load_4ch_or_border_b(x1, y1); | |
1159 | |||
1160 | // Copy even lanes twice -> 000022224444... these are the "bottom" | ||
1161 | // fractions | ||
1162 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32( | |
1163 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
1164 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32( | |
1165 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
1166 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32( | |
1167 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
1168 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32( | |
1169 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
1170 | |||
1171 | 9910 | res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
1172 | 9910 | } | |
1173 | |||
1174 | { // top | ||
1175 | 9910 | svuint16_t a = load_4ch_or_border_t(x0, y0); | |
1176 | 9910 | svuint16_t b = load_4ch_or_border_t(x1, y0); | |
1177 | 9910 | svuint16_t c = load_4ch_or_border_t(x0, y1); | |
1178 | 9910 | svuint16_t d = load_4ch_or_border_t(x1, y1); | |
1179 | |||
1180 | // Copy odd lanes twice -> 111133335555... these are the "top" | ||
1181 | // fractions | ||
1182 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32( | |
1183 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
1184 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32( | |
1185 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
1186 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32( | |
1187 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
1188 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32( | |
1189 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
1190 | |||
1191 | 9910 | res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
1192 | 9910 | } | |
1193 | |||
// Interleave the 64-bit pixels of the bottom (even entries) and top (odd
// entries) results to restore map order, then store the two halves: the
// first at dst[0], the second svcntd() 64-bit pixels further on.
1194 | 19820 | svuint64_t res_low = | |
1195 | 9910 | svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
1196 | 19820 | svuint64_t res_high = | |
1197 | 9910 | svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
1198 | 9910 | svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low); | |
1199 | 19820 | svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(), | |
1200 | 9910 | res_high); | |
// Advance all three cursors past the entries just processed.
1201 | 9910 | mapxy += step; | |
1202 | 9910 | mapfrac += step; | |
1203 | 9910 | dst += step; | |
1204 | 9910 | } | |
1205 | |||
// Source image rows; gathers are addressed relative to &src_rows_[0].
1206 | Rows<const ScalarType> src_rows_; | ||
1207 | |||
1208 | private: | ||
// References to caller-provided vectors that the constructor fills in:
// broadcast row stride, src_width - 1, src_height - 1 and the border pixel.
1209 | svuint32_t& v_src_stride_; | ||
1210 | svuint32_t& v_xmax_; | ||
1211 | svuint32_t& v_ymax_; | ||
1212 | svuint64_t& v_border_; | ||
1213 | }; // end of class RemapS16Point5Constant4ch<uint16_t> | ||
1214 | |||
1215 | // Most of the complexity comes from parameter checking. | ||
1216 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
1217 | template <typename T> | ||
1218 | 1140 | kleidicv_error_t remap_s16point5(const T* src, size_t src_stride, | |
1219 | size_t src_width, size_t src_height, T* dst, | ||
1220 | size_t dst_stride, size_t dst_width, | ||
1221 | size_t dst_height, size_t channels, | ||
1222 | const int16_t* mapxy, size_t mapxy_stride, | ||
1223 | const uint16_t* mapfrac, size_t mapfrac_stride, | ||
1224 | kleidicv_border_type_t border_type, | ||
1225 | const T* border_value) { | ||
1226 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 568 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 568 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 568 times.
|
1140 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
1227 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 566 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 566 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 566 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 566 times.
|
1136 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
1228 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 564 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 564 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 564 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 564 times.
|
1132 | CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); |
1229 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 562 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 562 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 562 times.
|
1128 | CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height); |
1230 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 556 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 556 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 560 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 556 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 556 times.
|
1124 | CHECK_IMAGE_SIZE(src_width, src_height); |
1231 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 554 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 552 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 552 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 554 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 552 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 552 times.
|
1112 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
1232 |
8/8✓ Branch 0 taken 268 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 268 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 266 times.
✓ Branch 7 taken 2 times.
|
1104 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { |
1233 | 4 | return KLEIDICV_ERROR_NULL_POINTER; | |
1234 | } | ||
1235 | |||
1236 |
8/8✓ Branch 0 taken 530 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 530 times.
✓ Branch 3 taken 20 times.
✓ Branch 4 taken 530 times.
✓ Branch 5 taken 20 times.
✓ Branch 6 taken 530 times.
✓ Branch 7 taken 20 times.
|
2200 | if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height, |
1237 | 1100 | dst_width, border_type, channels)) { | |
1238 | 40 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1239 | } | ||
1240 | |||
1241 | 1060 | Rows<const T> src_rows{src, src_stride, channels}; | |
1242 | 1060 | Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2}; | |
1243 | 1060 | Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1}; | |
1244 | 1060 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
1245 | 1060 | svuint16_t sv_src_stride; | |
1246 | 1060 | Rectangle rect{dst_width, dst_height}; | |
1247 | |||
1248 |
4/4✓ Branch 0 taken 266 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 264 times.
|
1060 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { |
1249 |
4/4✓ Branch 0 taken 132 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 132 times.
|
528 | if (channels == 1) { |
1250 | 264 | svuint16_t sv_width, sv_height, sv_border; | |
1251 | 528 | RemapS16Point5ConstantBorder<T> operation{ | |
1252 | 264 | src_rows, src_width, src_height, border_value, | |
1253 | sv_src_stride, sv_width, sv_height, sv_border}; | ||
1254 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1255 | 264 | } else { | |
1256 | assert(channels == 4); | ||
1257 | typedef typename double_element_width<T>::type DoubleType; | ||
1258 | typedef typename double_element_width<DoubleType>::type QuadType; | ||
1259 | 264 | typename VecTraits<DoubleType>::VectorType sv_width, sv_height, | |
1260 | sv_src_stride; | ||
1261 | 264 | typename VecTraits<QuadType>::VectorType sv_border; | |
1262 | 528 | RemapS16Point5Constant4ch<T> operation{ | |
1263 | 264 | src_rows, src_width, src_height, border_value, | |
1264 | sv_src_stride, sv_width, sv_height, sv_border}; | ||
1265 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1266 | 264 | } | |
1267 | 528 | } else { | |
1268 | assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); | ||
1269 | 532 | svint16_t sv_xmax, sv_ymax; | |
1270 |
4/4✓ Branch 0 taken 134 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 134 times.
✓ Branch 3 taken 132 times.
|
532 | if (channels == 1) { |
1271 | 268 | RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height, | |
1272 | sv_src_stride, sv_xmax, sv_ymax}; | ||
1273 | 268 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1274 | 268 | } else { | |
1275 | assert(channels == 4); | ||
1276 | if constexpr (std::is_same<T, uint8_t>::value) { | ||
1277 | 264 | RemapS16Point5Replicate4ch<T> operation{ | |
1278 | 132 | src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax}; | |
1279 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1280 | 132 | } | |
1281 | if constexpr (std::is_same<T, uint16_t>::value) { | ||
1282 | 132 | svuint32_t stride; | |
1283 | 132 | svint32_t xmax, ymax; | |
1284 | 132 | RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height, | |
1285 | stride, xmax, ymax}; | ||
1286 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1287 | 132 | } | |
1288 | } | ||
1289 | 532 | } | |
1290 | 1060 | return KLEIDICV_OK; | |
1291 | 1140 | } | |
1292 | // NOLINTEND(readability-function-cognitive-complexity) | ||
1293 | |||
1294 | #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ | ||
1295 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \ | ||
1296 | const type* src, size_t src_stride, size_t src_width, size_t src_height, \ | ||
1297 | type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ | ||
1298 | size_t channels, const int16_t* mapxy, size_t mapxy_stride, \ | ||
1299 | const uint16_t* mapfrac, size_t mapfrac_stride, \ | ||
1300 | kleidicv_border_type_t border_type, const type* border_value) | ||
1301 | |||
1302 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); | ||
1303 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); | ||
1304 | |||
1305 | } // namespace kleidicv::sve2 | ||
1306 |