Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
#include <cassert>
#include <cstring>

#include "kleidicv/neon.h"
#include "kleidicv/transform/remap.h"
9 | |||
10 | namespace kleidicv::neon { | ||
11 | |||
12 | template <typename ScalarType> | ||
13 | class RemapS16Point5Replicate; | ||
14 | |||
15 | template <> | ||
16 | class RemapS16Point5Replicate<uint8_t> { | ||
17 | public: | ||
18 | using ScalarType = uint8_t; | ||
19 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
20 | using MapVectorType = typename MapVecTraits::VectorType; | ||
21 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
22 | using FracVecTraits = neon::VecTraits<uint16_t>; | ||
23 | using FracVectorType = typename FracVecTraits::VectorType; | ||
24 | |||
25 | 67 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
26 | size_t src_height) | ||
27 | 67 | : src_rows_{src_rows}, | |
28 | 67 | v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
29 | 67 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
30 | 67 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
31 | |||
32 | 79 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
33 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
34 | 2536 | auto vector_path = [&](size_t step) { | |
35 | 2457 | MapVector2Type xy = vld2q_s16(&mapxy[0]); | |
36 | 2457 | FracVectorType frac = vld1q_u16(&mapfrac[0]); | |
37 | 4914 | uint16x8_t xfrac = | |
38 | 4914 | vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), | |
39 | // extract xfrac = frac[0:4] | ||
40 | 2457 | vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); | |
41 | 4914 | uint16x8_t yfrac = | |
42 | 4914 | vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), | |
43 | // extract yfrac = frac[5:9] | ||
44 | 4914 | vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), | |
45 | 2457 | vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); | |
46 | 2457 | uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
47 | 2457 | uint16x8_t nyfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
48 | |||
49 | // Clamp coordinates to within the dimensions of the source image | ||
50 | 4914 | uint16x8_t x0 = vreinterpretq_u16_s16( | |
51 | 2457 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); | |
52 | 4914 | uint16x8_t y0 = vreinterpretq_u16_s16( | |
53 | 2457 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); | |
54 | |||
55 | // x1 = x0 + 1, except if it's already xmax | ||
56 | 2457 | uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_)); | |
57 | 2457 | uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_)); | |
58 | |||
59 | 4914 | uint16x4_t dst_low = load_and_interpolate( | |
60 | 2457 | vmovl_u16(vget_low_u16(x0)), vget_low_u16(y0), | |
61 | 2457 | vmovl_u16(vget_low_u16(x1)), vget_low_u16(y1), vget_low_u16(xfrac), | |
62 | 2457 | vget_low_u16(yfrac), vget_low_u16(nxfrac), vget_low_u16(nyfrac)); | |
63 | |||
64 | 4914 | uint16x4_t dst_high = load_and_interpolate( | |
65 | 2457 | vmovl_high_u16(x0), vget_high_u16(y0), vmovl_high_u16(x1), | |
66 | 2457 | vget_high_u16(y1), vget_high_u16(xfrac), vget_high_u16(yfrac), | |
67 | 2457 | vget_high_u16(nxfrac), vget_high_u16(nyfrac)); | |
68 | |||
69 | 2457 | vst1_u8(&dst[0], vuzp1_u8(dst_low, dst_high)); | |
70 | 2457 | mapxy += ptrdiff_t(step); | |
71 | 2457 | mapfrac += ptrdiff_t(step); | |
72 | 2457 | dst += ptrdiff_t(step); | |
73 | 2457 | }; | |
74 | 79 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
75 | 79 | loop.unroll_once(vector_path); | |
76 | 158 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
77 | 79 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
78 | 79 | mapxy -= back_step; | |
79 | 79 | mapfrac -= back_step; | |
80 | 79 | dst -= back_step; | |
81 | 116 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
82 | 79 | } | |
83 | |||
84 | private: | ||
85 | 4914 | uint16x4_t load_and_interpolate(uint32x4_t x0, uint16x4_t y0, uint32x4_t x1, | |
86 | uint16x4_t y1, uint16x4_t xfrac, | ||
87 | uint16x4_t yfrac, uint16x4_t nxfrac, | ||
88 | uint16x4_t nyfrac) { | ||
89 | // Calculate offsets from coordinates (y * stride + x) | ||
90 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
91 | 4914 | uint32x4_t offset = vmlal_u16(x0, y0, v_src_stride_); | |
92 | 9828 | uint64_t acc = | |
93 | 9828 | static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
94 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
95 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
96 | 4914 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
97 | 4914 | uint16x4_t a = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
98 | |||
99 | 4914 | offset = vmlal_u16(x1, y0, v_src_stride_); | |
100 | |||
101 | 14742 | acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
102 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
103 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
104 | 4914 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
105 | 4914 | uint16x4_t b = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
106 | |||
107 | 4914 | uint16x4_t line0 = vmla_u16(vmul_u16(xfrac, b), nxfrac, a); | |
108 | |||
109 | 4914 | offset = vmlal_u16(x0, y1, v_src_stride_); | |
110 | |||
111 | 14742 | acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
112 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
113 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
114 | 4914 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
115 | 4914 | uint16x4_t c = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
116 | |||
117 | 9828 | uint32x4_t line0_lerpd = vmlal_u16( | |
118 | 4914 | vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, nyfrac); | |
119 | |||
120 | 4914 | offset = vmlal_u16(x1, y1, v_src_stride_); | |
121 | |||
122 | 14742 | acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) | | |
123 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 16) | | |
124 | 9828 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) << 32) | | |
125 | 4914 | (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 48); | |
126 | 4914 | uint16x4_t d = vreinterpret_u16_u64(vset_lane_u64(acc, vdup_n_u64(0), 0)); | |
127 | |||
128 | 4914 | uint16x4_t line1 = vmla_u16(vmul_u16(xfrac, d), nxfrac, c); | |
129 | 9828 | return vshrn_n_u32(vmlal_u16(line0_lerpd, line1, yfrac), | |
130 | 2 * REMAP16POINT5_FRAC_BITS); | ||
131 | 4914 | } | |
132 | |||
133 | Rows<const ScalarType> src_rows_; | ||
134 | uint16x4_t v_src_stride_; | ||
135 | int16x8_t v_xmax_; | ||
136 | int16x8_t v_ymax_; | ||
137 | }; // end of class RemapS16Point5Replicate<uint8_t> | ||
138 | |||
139 | // Common interpolation function used by all RemapS16Point5 operations except | ||
140 | // 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>) | ||
141 | // because that processes one half vector in one step | ||
142 | 46065 | static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, | |
143 | uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac, | ||
144 | uint16x8_t nxfrac, uint16x8_t nyfrac) { | ||
145 | 230325 | auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right, | |
146 | uint16x4_t frac, | ||
147 | uint16x4_t nfrac) -> uint32x4_t { | ||
148 | 184260 | return vmlal_u16(vmull_u16(nfrac, left), frac, right); | |
149 | }; | ||
150 | |||
151 | 138195 | auto interpolate_horizontal_low = [interpolate_horizontal]( | |
152 | uint16x8_t left, uint16x8_t right, | ||
153 | uint16x8_t frac, | ||
154 | uint16x8_t nfrac) -> uint32x4_t { | ||
155 | 184260 | return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), | |
156 | 92130 | vget_low_u16(frac), vget_low_u16(nfrac)); | |
157 | }; | ||
158 | |||
159 | 138195 | auto interpolate_horizontal_high = [interpolate_horizontal]( | |
160 | uint16x8_t left, uint16x8_t right, | ||
161 | uint16x8_t frac, | ||
162 | uint16x8_t nfrac) -> uint32x4_t { | ||
163 | 184260 | return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), | |
164 | 92130 | vget_high_u16(frac), vget_high_u16(nfrac)); | |
165 | }; | ||
166 | |||
167 | // Offset pixel values by 0.5 before rounding down. | ||
168 | 46065 | const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
169 | |||
170 | 138195 | auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, | |
171 | uint32x4_t nfrac) -> uint32x4_t { | ||
172 | 92130 | uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); | |
173 | 184260 | return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); | |
174 | 92130 | }; | |
175 | |||
176 | 46065 | uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac); | |
177 | 46065 | uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac); | |
178 | 46065 | uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac); | |
179 | 46065 | uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac); | |
180 | |||
181 | 92130 | uint32x4_t lo = | |
182 | 92130 | interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)), | |
183 | 46065 | vmovl_u16(vget_low_u16(nyfrac))); | |
184 | 92130 | uint32x4_t hi = interpolate_vertical( | |
185 | 46065 | line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac)); | |
186 | |||
187 | // Discard upper 16 bits of each element (low the precision back to original | ||
188 | // 16 bits) | ||
189 | 92130 | uint16x8_t result = | |
190 | 46065 | vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); | |
191 | 92130 | return result; | |
192 | 46065 | } | |
193 | |||
194 | template <> | ||
195 | class RemapS16Point5Replicate<uint16_t> { | ||
196 | public: | ||
197 | using ScalarType = uint16_t; | ||
198 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
199 | |||
200 | 67 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
201 | size_t src_height) | ||
202 | 67 | : src_rows_{src_rows}, | |
203 | 134 | v_src_element_stride_{vdupq_n_u16( | |
204 | 67 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
205 | 67 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
206 | 67 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))}, | |
207 | 67 | xfrac_{vdupq_n_u16(0)}, | |
208 | 67 | yfrac_{vdupq_n_u16(0)}, | |
209 | 67 | nxfrac_{vdupq_n_u16(0)}, | |
210 | 67 | nyfrac_{vdupq_n_u16(0)}, | |
211 | 67 | x0_{vdupq_n_s16(0)}, | |
212 | 67 | x1_{vdupq_n_s16(0)}, | |
213 | 67 | y0_{vdupq_n_s16(0)}, | |
214 | 67 | y1_{vdupq_n_s16(0)} {} | |
215 | |||
216 | 79 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
217 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
218 | 2476 | auto vector_path = [&](size_t step) { | |
219 | 2397 | prepare_maps(mapxy, mapfrac); | |
220 | 2397 | transform_pixels(dst); | |
221 | |||
222 | 2397 | mapxy += ptrdiff_t(step); | |
223 | 2397 | mapfrac += ptrdiff_t(step); | |
224 | 2397 | dst += ptrdiff_t(step); | |
225 | 2397 | }; | |
226 | 79 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
227 | 79 | loop.unroll_once(vector_path); | |
228 | 158 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
229 | 79 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
230 | 79 | mapxy -= back_step; | |
231 | 79 | mapfrac -= back_step; | |
232 | 79 | dst -= back_step; | |
233 | 116 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
234 | 79 | } | |
235 | |||
236 | 2397 | void prepare_maps(Columns<const int16_t> mapxy, | |
237 | Columns<const uint16_t> mapfrac) { | ||
238 | 2397 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
239 | 2397 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
240 | 2397 | uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); | |
241 | 2397 | uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); | |
242 | 4794 | xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), | |
243 | 2397 | vandq_u16(frac, frac_mask)); | |
244 | 2397 | yfrac_ = vbslq_u16( | |
245 | 2397 | vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), | |
246 | 2397 | vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask)); | |
247 | 2397 | nxfrac_ = vsubq_u16(frac_max, xfrac_); | |
248 | 2397 | nyfrac_ = vsubq_u16(frac_max, yfrac_); | |
249 | |||
250 | // Clamp coordinates to within the dimensions of the source image | ||
251 | 2397 | x0_ = vreinterpretq_u16_s16( | |
252 | 2397 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); | |
253 | 2397 | y0_ = vreinterpretq_u16_s16( | |
254 | 2397 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); | |
255 | |||
256 | // x1 = x0 + 1, except if it's already xmax | ||
257 | 2397 | x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_)); | |
258 | 2397 | y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_)); | |
259 | 2397 | } | |
260 | |||
261 | 2397 | void transform_pixels(Columns<uint16_t> dst) { | |
262 | 2397 | uint16x8_t a = load_pixels(x0_, y0_); | |
263 | 2397 | uint16x8_t b = load_pixels(x1_, y0_); | |
264 | 2397 | uint16x8_t c = load_pixels(x0_, y1_); | |
265 | 2397 | uint16x8_t d = load_pixels(x1_, y1_); | |
266 | |||
267 | 4794 | uint16x8_t result = | |
268 | 2397 | interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_); | |
269 | |||
270 | 2397 | vst1q_u16(&dst[0], result); | |
271 | 2397 | } | |
272 | |||
273 | 9588 | uint16x8_t load_pixels(int16x8_t x, int16x8_t y) { | |
274 | // Clamp coordinates to within the dimensions of the source image | ||
275 | 19176 | uint16x8_t x_clamped = | |
276 | 9588 | vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_); | |
277 | 19176 | uint16x8_t y_clamped = | |
278 | 9588 | vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_); | |
279 | |||
280 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
281 | 19176 | uint32x4_t indices_low = | |
282 | 19176 | vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped), | |
283 | 9588 | vget_low_u16(v_src_element_stride_)); | |
284 | 19176 | uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped), | |
285 | 9588 | y_clamped, v_src_element_stride_); | |
286 | |||
287 | // Read pixels from source | ||
288 | 86292 | uint16x8_t pixels = { | |
289 | 9588 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
290 | 9588 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
291 | 9588 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
292 | 9588 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
293 | 9588 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
294 | 9588 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
295 | 9588 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
296 | 9588 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
297 | }; | ||
298 | |||
299 | 19176 | return pixels; | |
300 | 9588 | } | |
301 | |||
302 | private: | ||
303 | Rows<const ScalarType> src_rows_; | ||
304 | uint16x8_t v_src_element_stride_; | ||
305 | int16x8_t v_xmax_; | ||
306 | int16x8_t v_ymax_; | ||
307 | uint16x8_t xfrac_; | ||
308 | uint16x8_t yfrac_; | ||
309 | uint16x8_t nxfrac_; | ||
310 | uint16x8_t nyfrac_; | ||
311 | int16x8_t x0_; | ||
312 | int16x8_t x1_; | ||
313 | int16x8_t y0_; | ||
314 | int16x8_t y1_; | ||
315 | }; // end of class RemapS16Point5Replicate<uint16_t> | ||
316 | |||
317 | template <typename ScalarType> | ||
318 | class RemapS16Point5ConstantBorder; | ||
319 | |||
320 | template <> | ||
321 | class RemapS16Point5ConstantBorder<uint8_t> { | ||
322 | public: | ||
323 | using ScalarType = uint8_t; | ||
324 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
325 | |||
326 | 66 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
327 | size_t src_width, size_t src_height, | ||
328 | const ScalarType *border_value) | ||
329 | 66 | : src_rows_{src_rows}, | |
330 | 66 | v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
331 | 66 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
332 | 66 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
333 | 66 | v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {} | |
334 | |||
335 | 78 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
336 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
337 | 2534 | auto vector_path = [&](size_t step) { | |
338 | 2456 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
339 | 2456 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
340 | 2456 | uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); | |
341 | 2456 | uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); | |
342 | 2456 | uint16x8_t xfrac = vandq_u16(frac, frac_mask); | |
343 | 4912 | uint16x8_t yfrac = | |
344 | 2456 | vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); | |
345 | 2456 | uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac); | |
346 | 2456 | uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac); | |
347 | |||
348 | 2456 | uint16x8_t one = vdupq_n_u16(1); | |
349 | 2456 | uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]); | |
350 | 2456 | uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]); | |
351 | 2456 | uint16x8_t x1 = vaddq_u16(x0, one); | |
352 | 2456 | uint16x8_t y1 = vaddq_u16(y0, one); | |
353 | |||
354 | 4912 | uint16x8_t a = load_pixels_or_constant_border( | |
355 | 2456 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0); | |
356 | 4912 | uint16x8_t b = load_pixels_or_constant_border( | |
357 | 2456 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0); | |
358 | 4912 | uint16x8_t c = load_pixels_or_constant_border( | |
359 | 2456 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1); | |
360 | 4912 | uint16x8_t d = load_pixels_or_constant_border( | |
361 | 2456 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1); | |
362 | |||
363 | 2456 | uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac); | |
364 | |||
365 | 2456 | vst1_u8(&dst[0], vqmovn_u16(result)); | |
366 | 2456 | mapxy += ptrdiff_t(step); | |
367 | 2456 | mapfrac += ptrdiff_t(step); | |
368 | 2456 | dst += ptrdiff_t(step); | |
369 | 2456 | }; | |
370 | 78 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
371 | 78 | loop.unroll_once(vector_path); | |
372 | 156 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
373 | 78 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
374 | 78 | mapxy -= back_step; | |
375 | 78 | mapfrac -= back_step; | |
376 | 78 | dst -= back_step; | |
377 | 115 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
378 | 78 | } | |
379 | |||
380 | private: | ||
381 | 9824 | uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_, | |
382 | uint16x8_t v_src_element_stride_, | ||
383 | uint16x8_t v_width_, | ||
384 | uint16x8_t v_height_, | ||
385 | uint16x8_t v_border_, uint16x8_t x, | ||
386 | uint16x8_t y) { | ||
387 | // Find whether coordinates are within the image dimensions. | ||
388 | // Negative coordinates are interpreted as large values due to the s16->u16 | ||
389 | // reinterpretation. | ||
390 | 19648 | uint16x8_t in_range = | |
391 | 19648 | vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_), | |
392 | 9824 | vcltq_u16(vreinterpretq_u16_s16(y), v_height_)); | |
393 | |||
394 | // Zero out-of-range coordinates. | ||
395 | 9824 | x = vandq_u16(in_range, x); | |
396 | 9824 | y = vandq_u16(in_range, y); | |
397 | |||
398 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
399 | 19648 | uint32x4_t indices_low = | |
400 | 19648 | vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), | |
401 | 9824 | vget_low_u16(v_src_element_stride_)); | |
402 | 19648 | uint32x4_t indices_high = | |
403 | 9824 | vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); | |
404 | |||
405 | // Read pixels from source | ||
406 | 88416 | uint8x8_t pixels = { | |
407 | 9824 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
408 | 9824 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
409 | 9824 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
410 | 9824 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
411 | 9824 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
412 | 9824 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
413 | 9824 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
414 | 9824 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
415 | }; | ||
416 | // Select between source pixels and border colour | ||
417 | 19648 | return vbslq_u16(in_range, vmovl_u8(pixels), v_border_); | |
418 | 9824 | } | |
419 | |||
420 | Rows<const ScalarType> src_rows_; | ||
421 | uint16x8_t v_src_stride_; | ||
422 | uint16x8_t v_width_; | ||
423 | uint16x8_t v_height_; | ||
424 | uint16x8_t v_border_; | ||
425 | }; // end of class RemapS16Point5ConstantBorder<uint8_t> | ||
426 | |||
427 | template <> | ||
428 | class RemapS16Point5ConstantBorder<uint16_t> { | ||
429 | public: | ||
430 | using ScalarType = uint16_t; | ||
431 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
432 | |||
433 | 66 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
434 | size_t src_width, size_t src_height, | ||
435 | const ScalarType *border_value) | ||
436 | 66 | : src_rows_{src_rows}, | |
437 | 132 | v_src_element_stride_{vdupq_n_u16( | |
438 | 66 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
439 | 66 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
440 | 66 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
441 | 66 | v_border_{vdupq_n_u16(*border_value)}, | |
442 | 66 | xfrac_{vdupq_n_u16(0)}, | |
443 | 66 | yfrac_{vdupq_n_u16(0)}, | |
444 | 66 | nxfrac_{vdupq_n_u16(0)}, | |
445 | 66 | nyfrac_{vdupq_n_u16(0)}, | |
446 | 66 | x0_{vdupq_n_s16(0)}, | |
447 | 66 | x1_{vdupq_n_s16(0)}, | |
448 | 66 | y0_{vdupq_n_s16(0)}, | |
449 | 66 | y1_{vdupq_n_s16(0)} {} | |
450 | |||
451 | 2396 | void prepare_maps(Columns<const int16_t> mapxy, | |
452 | Columns<const uint16_t> mapfrac) { | ||
453 | 2396 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
454 | 2396 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
455 | 2396 | uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); | |
456 | 2396 | uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); | |
457 | 2396 | xfrac_ = vandq_u16(frac, frac_mask); | |
458 | 2396 | yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); | |
459 | 2396 | nxfrac_ = vsubq_u16(frac_max, xfrac_); | |
460 | 2396 | nyfrac_ = vsubq_u16(frac_max, yfrac_); | |
461 | |||
462 | 2396 | uint16x8_t one = vdupq_n_u16(1); | |
463 | 2396 | x0_ = xy.val[0]; | |
464 | 2396 | y0_ = xy.val[1]; | |
465 | 2396 | x1_ = vaddq_u16(x0_, one); | |
466 | 2396 | y1_ = vaddq_u16(y0_, one); | |
467 | 2396 | } | |
468 | |||
469 | 78 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
470 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
471 | 2474 | auto vector_path = [&](size_t step) { | |
472 | 2396 | prepare_maps(mapxy, mapfrac); | |
473 | 2396 | transform_pixels(dst); | |
474 | |||
475 | 2396 | mapxy += ptrdiff_t(step); | |
476 | 2396 | mapfrac += ptrdiff_t(step); | |
477 | 2396 | dst += ptrdiff_t(step); | |
478 | 2396 | }; | |
479 | 78 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
480 | 78 | loop.unroll_once(vector_path); | |
481 | 156 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
482 | 78 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
483 | 78 | mapxy -= back_step; | |
484 | 78 | mapfrac -= back_step; | |
485 | 78 | dst -= back_step; | |
486 | 115 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
487 | 78 | } | |
488 | |||
489 | 2396 | void transform_pixels(Columns<uint16_t> dst) { | |
490 | 2396 | uint16x8_t a = load_pixels(x0_, y0_); | |
491 | 2396 | uint16x8_t b = load_pixels(x1_, y0_); | |
492 | 2396 | uint16x8_t c = load_pixels(x0_, y1_); | |
493 | 2396 | uint16x8_t d = load_pixels(x1_, y1_); | |
494 | |||
495 | 4792 | uint16x8_t result = | |
496 | 2396 | interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_); | |
497 | |||
498 | 2396 | vst1q_u16(&dst[0], result); | |
499 | 2396 | } | |
500 | |||
501 | 9584 | uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) { | |
502 | // Find whether coordinates are within the image dimensions. | ||
503 | // Negative coordinates are interpreted as large values due to the s16->u16 | ||
504 | // reinterpretation. | ||
505 | 19168 | uint16x8_t in_range = | |
506 | 19168 | vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_), | |
507 | 9584 | vcltq_u16(vreinterpretq_u16_s16(y), v_height_)); | |
508 | |||
509 | // Zero out-of-range coordinates. | ||
510 | 9584 | x = vandq_u16(in_range, x); | |
511 | 9584 | y = vandq_u16(in_range, y); | |
512 | |||
513 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
514 | 19168 | uint32x4_t indices_low = | |
515 | 19168 | vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), | |
516 | 9584 | vget_low_u16(v_src_element_stride_)); | |
517 | 19168 | uint32x4_t indices_high = | |
518 | 9584 | vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); | |
519 | |||
520 | // Read pixels from source | ||
521 | 86256 | uint16x8_t pixels = { | |
522 | 9584 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
523 | 9584 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
524 | 9584 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
525 | 9584 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
526 | 9584 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
527 | 9584 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
528 | 9584 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
529 | 9584 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
530 | }; | ||
531 | // Select between source pixels and border colour | ||
532 | 19168 | return vbslq_u16(in_range, pixels, v_border_); | |
533 | 9584 | } | |
534 | |||
535 | private: | ||
536 | Rows<const ScalarType> src_rows_; | ||
537 | uint16x8_t v_src_element_stride_; | ||
538 | uint16x8_t v_width_; | ||
539 | uint16x8_t v_height_; | ||
540 | uint16x8_t v_border_; | ||
541 | uint16x8_t xfrac_; | ||
542 | uint16x8_t yfrac_; | ||
543 | uint16x8_t nxfrac_; | ||
544 | uint16x8_t nyfrac_; | ||
545 | int16x8_t x0_; | ||
546 | int16x8_t x1_; | ||
547 | int16x8_t y0_; | ||
548 | int16x8_t y1_; | ||
549 | }; // end of class RemapS16Point5ConstantBorder<uint16_t> | ||
550 | |||
551 | 9704 | inline void get_coordinates(Columns<const int16_t> mapxy, | |
552 | Columns<const uint16_t> mapfrac, uint16x8_t &x, | ||
553 | uint16x8_t &y, uint16x8_t &xfrac, | ||
554 | uint16x8_t &yfrac) { | ||
555 | 9704 | int16x8x2_t xy = vld2q_s16(&mapxy[0]); | |
556 | 9704 | x = xy.val[0]; | |
557 | 9704 | y = xy.val[1]; | |
558 | |||
559 | 9704 | uint16x8_t frac = vld1q_u16(&mapfrac[0]); | |
560 | 9704 | xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
561 | 19408 | yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), | |
562 | 9704 | vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
563 | 9704 | } | |
564 | |||
565 | 19408 | inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, | |
566 | uint16x4_t y1, uint32x4_t &offsets_a, | ||
567 | uint32x4_t &offsets_b, uint32x4_t &offsets_c, | ||
568 | uint32x4_t &offsets_d, | ||
569 | uint16x4_t v_src_element_stride) { | ||
570 | // Multiply by 4 because of channels | ||
571 | 19408 | uint32x4_t x0_scaled = vshll_n_u16(x0, 2); | |
572 | 19408 | uint32x4_t x1_scaled = vshll_n_u16(x1, 2); | |
573 | |||
574 | // Calculate offsets from coordinates (y * element_stride + x) | ||
575 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
576 | 19408 | offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride); | |
577 | 19408 | offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride); | |
578 | 19408 | offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride); | |
579 | 19408 | offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride); | |
580 | 19408 | } | |
581 | |||
582 | inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low, | ||
583 | uint8_t frac_high) { | ||
584 | uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low, | ||
585 | frac_high, frac_high, frac_high, frac_high}; | ||
586 | return vmovl_u8(frac_low_high); | ||
587 | } | ||
588 | |||
// Reads four bytes starting at `src` as one native-endian 32-bit value and
// returns it zero-extended to 64 bits. `src` does not need to be aligned: the
// bytes are copied with std::memcpy rather than read through a cast pointer.
inline uint64_t load_32bit(const uint8_t *src) {
  uint32_t value = 0;
  std::memcpy(&value, src, sizeof(value));
  return value;
}
594 | |||
595 | 39296 | inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows, | |
596 | uint32x4_t offsets) { | ||
597 | 78592 | uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) | | |
598 | 39296 | (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32); | |
599 | 78592 | uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) | | |
600 | 39296 | (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32); | |
601 | 78592 | return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23)); | |
602 | 39296 | } | |
603 | |||
604 | 4912 | inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) { | |
605 | using ScalarType = uint8_t; | ||
606 | 4912 | neon::VecTraits<ScalarType>::store(res, &dst[0]); | |
607 | 4912 | } | |
608 | |||
609 | 76672 | inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows, | |
610 | uint32x2_t offsets) { | ||
611 | 153344 | return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]), | |
612 | 76672 | vld1_u16(&src_rows[vget_lane_u32(offsets, 1)])); | |
613 | } | ||
614 | |||
615 | 4792 | inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) { | |
616 | using ScalarType = uint16_t; | ||
617 | 4792 | neon::VecTraits<ScalarType>::store(res, &dst[0]); | |
618 | 4792 | } | |
619 | |||
620 | // Replicate border specific functions | ||
621 | 4852 | inline void get_coordinates_replicate(Columns<const int16_t> mapxy, | |
622 | Columns<const uint16_t> mapfrac, | ||
623 | uint16x8_t &x0, uint16x8_t &y0, | ||
624 | uint16x8_t &x1, uint16x8_t &y1, | ||
625 | uint16x8_t &xfrac, uint16x8_t &yfrac, | ||
626 | int16x8_t v_xmax, int16x8_t v_ymax) { | ||
627 | 4852 | get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); | |
628 | |||
629 | // Zero the xfrac (or yfrac) if x (or y) are below zero | ||
630 | 4852 | xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac); | |
631 | 4852 | yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac); | |
632 | |||
633 | // Clamp coordinates to within the dimensions of the source image | ||
634 | 4852 | x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax))); | |
635 | 4852 | y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax))); | |
636 | |||
637 | // x1 = x0 + 1, except if it's already xmax | ||
638 | 4852 | x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax)); | |
639 | 4852 | y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax)); | |
640 | 4852 | } | |
641 | |||
// Loads the four neighbour sets a, b, c and d for the replicate border mode.
// Each output is one load_4px_4ch call (defined elsewhere) with the matching
// offsets vector; no masking is applied here.
// NOTE(review): presumably the caller has already clamped the coordinates so
// that every offset is inside the source image - confirm.
642 | 4912 | inline void load_pixels_u8_4ch_replicate( | |
643 | Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
644 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b, | ||
645 | uint8x16_t &c, uint8x16_t &d) { | ||
646 | 4912 | a = load_4px_4ch(src_rows, offsets_a); | |
647 | 4912 | b = load_4px_4ch(src_rows, offsets_b); | |
648 | 4912 | c = load_4px_4ch(src_rows, offsets_c); | |
649 | 4912 | d = load_4px_4ch(src_rows, offsets_d); | |
650 | 4912 | } | |
651 | |||
// uint16_t counterpart of the replicate loader. Each offsets vector holds four
// offsets; lanes 0-1 are loaded into the *_lo output and lanes 2-3 into the
// *_hi output, two pixels per load_2px_4ch call. No masking is applied here.
652 | 4792 | inline void load_pixels_u16_4ch_replicate( | |
653 | Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
654 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo, | ||
655 | uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo, | ||
656 | uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) { | ||
// Offsets in lanes 0 and 1.
657 | 4792 | a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); | |
658 | 4792 | b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); | |
659 | 4792 | c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); | |
660 | 4792 | d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); | |
661 | |||
// Offsets in lanes 2 and 3.
662 | 4792 | a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); | |
663 | 4792 | b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); | |
664 | 4792 | c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); | |
665 | 4792 | d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); | |
666 | 4792 | } | |
667 | |||
// Primary template: declared only. The row operations are provided by the
// explicit specializations for uint8_t and uint16_t.
668 | template <typename ScalarType> | ||
669 | class RemapS16Point5Replicate4ch; | ||
670 | |||
// Row operation for 4-channel uint8_t remapping with replicate border
// handling. process_row consumes one row of map coordinates (mapxy) and
// fractions (mapfrac) and writes one row of destination pixels.
671 | template <> | ||
672 | class RemapS16Point5Replicate4ch<uint8_t> { | ||
673 | public: | ||
674 | using ScalarType = uint8_t; | ||
675 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
676 | |||
// Broadcasts the source stride (narrowed to 16 bits) and the largest valid
// x / y coordinate (src_width - 1, src_height - 1, narrowed to int16_t).
// NOTE(review): the narrowing casts assume the caller has already limited
// stride and image size to 16-bit range - confirm.
677 | 66 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
678 | size_t src_height) | ||
679 | 66 | : src_rows_{src_rows}, | |
680 | 66 | v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
681 | 66 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
682 | 66 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
683 | |||
// Processes `width` destination pixels of one row.
684 | 78 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
685 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
// One call handles one vector of map entries (eight uint16 lanes per
// coordinate vector), then advances mapxy, mapfrac and dst by `step`.
686 | 2534 | auto vector_path = [&](size_t step) { | |
687 | 2456 | uint16x8_t x0, y0, x1, y1; | |
688 | 2456 | uint16x8_t xfrac, yfrac; | |
689 | 4912 | get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, | |
690 | 2456 | v_xmax_, v_ymax_); | |
691 | |||
692 | 2456 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
693 | 2456 | uint8x16_t a, b, c, d; | |
694 | 2456 | uint8x16x2_t res; | |
695 | |||
// First half: coordinates in lanes 0-3.
696 | 4912 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
697 | 2456 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
698 | 2456 | offsets_d, v_src_stride_); | |
699 | 4912 | load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
700 | 2456 | offsets_d, a, b, c, d); | |
701 | |||
702 | // Doubled fractions 001122..., low part | ||
703 | 2456 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
704 | 2456 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
// Complementary weights: REMAP16POINT5_FRAC_MAX - frac.
705 | 4912 | uint16x8_t nxfrac2 = | |
706 | 2456 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
707 | 4912 | uint16x8_t nyfrac2 = | |
708 | 2456 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
709 | // Quadrupled fractions (00001111) are passed to interpolate | ||
// The low 8 bytes of a..d are widened to 16 bits for res0 and the high
// 8 bytes for res1; interpolate is defined elsewhere.
710 | 4912 | uint16x8_t res0 = interpolate( | |
711 | 2456 | vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)), | |
712 | 2456 | vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
713 | 2456 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
714 | 4912 | uint16x8_t res1 = interpolate( | |
715 | 2456 | vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
716 | 2456 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2), | |
717 | 2456 | vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2)); | |
// vuzp1q_u8 keeps the even-indexed bytes, i.e. the low byte of every
// 16-bit result, narrowing res0/res1 back to 8 bits per channel.
// NOTE(review): presumably interpolate already returns values that fit
// in 8 bits - confirm.
718 | 2456 | res.val[0] = | |
719 | 2456 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
720 | |||
// Second half: coordinates in lanes 4-7, same sequence as above.
721 | 4912 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
722 | 2456 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
723 | 2456 | offsets_d, v_src_stride_); | |
724 | 4912 | load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
725 | 2456 | offsets_d, a, b, c, d); | |
726 | // Doubled fractions 001122..., high part | ||
727 | 2456 | xfrac2 = vzip2q(xfrac, xfrac); | |
728 | 2456 | yfrac2 = vzip2q(yfrac, yfrac); | |
729 | 2456 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
730 | 2456 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
731 | // Quadrupled fractions (00001111) are passed to interpolate | ||
732 | 4912 | res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), | |
733 | 2456 | vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)), | |
734 | 2456 | vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
735 | 2456 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
736 | 4912 | res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
737 | 2456 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), | |
738 | 2456 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
739 | 2456 | vzip2q(nyfrac2, nyfrac2)); | |
740 | 2456 | res.val[1] = | |
741 | 2456 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
742 | |||
743 | 2456 | store_pixels_u8_4ch(res, dst); | |
744 | 2456 | mapxy += ptrdiff_t(step); | |
745 | 2456 | mapfrac += ptrdiff_t(step); | |
746 | 2456 | dst += ptrdiff_t(step); | |
747 | 2456 | }; | |
748 | |||
// Full vectors go through unroll_once. The cursors are then moved back by
// (step - remaining_length) before the leftover is processed with the same
// full-width path.
// NOTE(review): presumably this makes the last vector end exactly at the
// end of the row, overlapping pixels that were already written, and assumes
// width is at least one vector; LoopUnroll is defined elsewhere - confirm.
749 | 78 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
750 | 78 | loop.unroll_once(vector_path); | |
751 | 156 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
752 | 78 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
753 | 78 | mapxy -= back_step; | |
754 | 78 | mapfrac -= back_step; | |
755 | 78 | dst -= back_step; | |
756 | 115 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
757 | 78 | } | |
758 | |||
759 | private: | ||
// Source image rows.
760 | Rows<const ScalarType> src_rows_; | ||
// Source stride as returned by Rows::stride(), broadcast to four lanes.
761 | uint16x4_t v_src_stride_; | ||
// src_width - 1 and src_height - 1, broadcast to eight lanes.
762 | int16x8_t v_xmax_; | ||
763 | int16x8_t v_ymax_; | ||
764 | }; // end of class RemapS16Point5Replicate4ch<uint8_t> | ||
765 | |||
// Row operation for 4-channel uint16_t remapping with replicate border
// handling. Same structure as the uint8_t specialization, but every 128-bit
// vector holds only two pixels, so each group of four coordinates produces a
// low and a high vector and no widening or narrowing is needed.
766 | template <> | ||
767 | class RemapS16Point5Replicate4ch<uint16_t> { | ||
768 | public: | ||
769 | using ScalarType = uint16_t; | ||
770 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
771 | |||
// The stride is divided by sizeof(ScalarType) before being broadcast, so it
// is expressed in uint16_t elements rather than in the units returned by
// Rows::stride().
// NOTE(review): the narrowing casts assume stride and image size fit in
// 16 bits - confirm that the caller enforces this.
772 | 66 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
773 | size_t src_height) | ||
774 | 66 | : src_rows_{src_rows}, | |
775 | 132 | v_src_element_stride_{vdup_n_u16( | |
776 | 66 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
777 | 66 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
778 | 66 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
779 | |||
// Processes `width` destination pixels of one row.
780 | 78 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
781 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
// One call handles one vector of map entries and fills res.val[0..3],
// then advances mapxy, mapfrac and dst by `step`.
782 | 2474 | auto vector_path = [&](size_t step) { | |
783 | 2396 | uint16x8_t x0, y0, x1, y1; | |
784 | 2396 | uint16x8_t xfrac, yfrac; | |
785 | 4792 | get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, | |
786 | 2396 | v_xmax_, v_ymax_); | |
787 | |||
788 | 2396 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
789 | 2396 | uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; | |
790 | 2396 | uint16x8x4_t res; | |
// First half: coordinates in lanes 0-3 -> res.val[0] and res.val[1].
791 | 4792 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
792 | 2396 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
793 | 2396 | offsets_d, v_src_element_stride_); | |
794 | 4792 | load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
795 | 2396 | offsets_d, a_low, a_high, b_low, b_high, | |
796 | c_low, c_high, d_low, d_high); | ||
797 | |||
798 | // Doubled fractions 001122..., low part | ||
799 | 2396 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
800 | 2396 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
// Complementary weights: REMAP16POINT5_FRAC_MAX - frac.
801 | 4792 | uint16x8_t nxfrac2 = | |
802 | 2396 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
803 | 4792 | uint16x8_t nyfrac2 = | |
804 | 2396 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
805 | // Quadrupled fractions (00001111) are passed to interpolate | ||
806 | 2396 | res.val[0] = | |
807 | 4792 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
808 | 2396 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
809 | 2396 | vzip1q(nyfrac2, nyfrac2)); | |
810 | 2396 | res.val[1] = | |
811 | 4792 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
812 | 2396 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
813 | 2396 | vzip2q(nyfrac2, nyfrac2)); | |
814 | |||
// Second half: coordinates in lanes 4-7 -> res.val[2] and res.val[3].
815 | 4792 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
816 | 2396 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
817 | 2396 | offsets_d, v_src_element_stride_); | |
818 | 4792 | load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, | |
819 | 2396 | offsets_d, a_low, a_high, b_low, b_high, | |
820 | c_low, c_high, d_low, d_high); | ||
821 | // Doubled fractions 001122..., high part | ||
822 | 2396 | xfrac2 = vzip2q(xfrac, xfrac); | |
823 | 2396 | yfrac2 = vzip2q(yfrac, yfrac); | |
824 | 2396 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
825 | 2396 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
826 | // Quadrupled fractions (00001111) are passed to interpolate | ||
827 | 2396 | res.val[2] = | |
828 | 4792 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
829 | 2396 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
830 | 2396 | vzip1q(nyfrac2, nyfrac2)); | |
831 | 2396 | res.val[3] = | |
832 | 4792 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
833 | 2396 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
834 | 2396 | vzip2q(nyfrac2, nyfrac2)); | |
835 | |||
836 | 2396 | store_pixels_u16_4ch(res, dst); | |
837 | 2396 | mapxy += ptrdiff_t(step); | |
838 | 2396 | mapfrac += ptrdiff_t(step); | |
839 | 2396 | dst += ptrdiff_t(step); | |
840 | 2396 | }; | |
841 | |||
// Full vectors go through unroll_once. The cursors are then moved back by
// (step - remaining_length) before the leftover is processed with the same
// full-width path.
// NOTE(review): presumably the last vector then ends exactly at the end of
// the row and overlaps already-written pixels; LoopUnroll is defined
// elsewhere - confirm.
842 | 78 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
843 | 78 | loop.unroll_once(vector_path); | |
844 | 156 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
845 | 78 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
846 | 78 | mapxy -= back_step; | |
847 | 78 | mapfrac -= back_step; | |
848 | 78 | dst -= back_step; | |
849 | 115 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
850 | 78 | } | |
851 | |||
852 | private: | ||
// Source image rows.
853 | Rows<const ScalarType> src_rows_; | ||
// Rows::stride() divided by sizeof(uint16_t), broadcast to four lanes.
854 | uint16x4_t v_src_element_stride_; | ||
// src_width - 1 and src_height - 1, broadcast to eight lanes.
855 | int16x8_t v_xmax_; | ||
856 | int16x8_t v_ymax_; | ||
857 | }; // end of class RemapS16Point5Replicate4ch<uint16_t> | ||
858 | |||
859 | // Constant border specific functions | ||
// Produces the sampling coordinates, fractions and per-neighbour validity
// masks for the constant border mode.
//   mapxy, mapfrac      : map inputs, forwarded to get_coordinates (defined
//                         elsewhere), which fills x0, y0, xfrac and yfrac.
//   v_width, v_height   : source dimensions, broadcast to every lane.
//   x1, y1              : out - x0 + 1 and y0 + 1 (16-bit modular add).
//   in_range_a .. _d    : out - all-ones lanes where the neighbour is inside
//                         the image: a = (x0, y0), b = (x1, y0),
//                         c = (x0, y1), d = (x1, y1).
// The range tests are unsigned "less than" comparisons against width/height.
// NOTE(review): presumably a negative map coordinate arrives here as a large
// unsigned value and therefore fails the test without a separate ">= 0"
// check - confirm against get_coordinates.
860 | 4852 | inline void get_coordinates_constant( | |
861 | Columns<const int16_t> mapxy, Columns<const uint16_t> mapfrac, | ||
862 | uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0, | ||
863 | uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac, | ||
864 | uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c, | ||
865 | uint16x8_t &in_range_d) { | ||
866 | 4852 | get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); | |
867 | |||
868 | 4852 | uint16x8_t one = vdupq_n_u16(1); | |
869 | 4852 | x1 = vaddq_u16(x0, one); | |
870 | 4852 | y1 = vaddq_u16(y0, one); | |
871 | |||
872 | 4852 | uint16x8_t x0_in_range = vcltq_u16(x0, v_width); | |
873 | 4852 | uint16x8_t y0_in_range = vcltq_u16(y0, v_height); | |
874 | 4852 | uint16x8_t x1_in_range = vcltq_u16(x1, v_width); | |
875 | 4852 | uint16x8_t y1_in_range = vcltq_u16(y1, v_height); | |
876 | |||
// A neighbour is valid only if both of its coordinates are in range.
877 | 4852 | in_range_a = vandq(x0_in_range, y0_in_range); | |
878 | 4852 | in_range_b = vandq(x1_in_range, y0_in_range); | |
879 | 4852 | in_range_c = vandq(x0_in_range, y1_in_range); | |
880 | 4852 | in_range_d = vandq(x1_in_range, y1_in_range); | |
881 | 4852 | } | |
882 | |||
// Returns `offsets` with every lane whose `in_range` mask is clear replaced by
// 0 (bitwise select). The constant-border loaders call this before loading, so
// lanes flagged out of range read from offset 0 instead of their computed
// offset.
883 | 38816 | inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range, | |
884 | uint32x4_t offsets) { | ||
885 | 38816 | return vbslq_u32(in_range, offsets, vdupq_n_u32(0)); | |
886 | } | ||
887 | |||
// Bitwise select per 32-bit lane: keeps the loaded `pixels` where `in_range`
// is set and takes `v_border` where it is clear. One 32-bit lane covers the
// four uint8_t channels of one pixel.
// NOTE(review): v_border is a uint8x16_t passed where vbslq_u32 takes a
// uint32x4_t; this appears to rely on lax vector conversions - confirm, or
// consider an explicit vreinterpretq_u32_u8.
888 | 19648 | inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range, | |
889 | uint8x16_t pixels, | ||
890 | uint8x16_t v_border) { | ||
891 | 19648 | return vreinterpretq_u8_u32( | |
892 | 19648 | vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), v_border)); | |
893 | } | ||
894 | |||
// Bitwise select per 64-bit lane: keeps the loaded `pixels` where `in_range`
// is set and takes `v_border` where it is clear. One 64-bit lane covers the
// four uint16_t channels of one pixel.
// NOTE(review): v_border is a uint16x8_t passed where vbslq_u64 takes a
// uint64x2_t; this appears to rely on lax vector conversions - confirm, or
// consider an explicit vreinterpretq_u64_u16.
895 | 38336 | inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range, | |
896 | uint16x8_t pixels, | ||
897 | uint16x8_t v_border) { | ||
898 | 38336 | return vreinterpretq_u16_u64( | |
899 | 38336 | vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), v_border)); | |
900 | } | ||
901 | |||
// Loads the four neighbour sets a, b, c and d for the constant border mode
// (uint8_t, 4 channels), in three steps:
//   1. offsets of out-of-range lanes are replaced by 0,
//   2. pixels are loaded with load_4px_4ch (defined elsewhere),
//   3. pixels of out-of-range lanes are replaced by v_border.
// The offsets parameters are taken by value, so the caller's vectors are not
// modified. Each in_range_* mask has one 32-bit lane per pixel.
902 | 4912 | inline void load_pixels_u8_4ch_constant( | |
903 | Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
904 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a, | ||
905 | uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d, | ||
906 | uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c, | ||
907 | uint8x16_t &d) { | ||
// Step 1: redirect out-of-range lanes to offset 0.
908 | 4912 | offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); | |
909 | 4912 | offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); | |
910 | 4912 | offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); | |
911 | 4912 | offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); | |
912 | |||
// Step 2: load.
913 | 4912 | a = load_4px_4ch(src_rows, offsets_a); | |
914 | 4912 | b = load_4px_4ch(src_rows, offsets_b); | |
915 | 4912 | c = load_4px_4ch(src_rows, offsets_c); | |
916 | 4912 | d = load_4px_4ch(src_rows, offsets_d); | |
917 | |||
// Step 3: overwrite the placeholder pixels with the border value.
918 | 4912 | a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border); | |
919 | 4912 | b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border); | |
920 | 4912 | c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border); | |
921 | 4912 | d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border); | |
922 | 4912 | } | |
923 | |||
// Loads the four neighbour sets for the constant border mode (uint16_t,
// 4 channels). Each offsets / in_range vector describes four pixels; lanes 0-1
// produce the *_lo outputs and lanes 2-3 the *_hi outputs. As in the uint8_t
// loader, out-of-range lanes are loaded from offset 0 and then replaced by
// v_border. The offsets parameters are taken by value, so the caller's vectors
// are not modified.
924 | 4792 | inline void load_pixels_u16_4ch_constant( | |
925 | Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, | ||
926 | uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a, | ||
927 | uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d, | ||
928 | uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo, | ||
929 | uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo, | ||
930 | uint16x8_t &d_hi) { | ||
931 | 4792 | offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); | |
932 | 4792 | offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); | |
933 | 4792 | offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); | |
934 | 4792 | offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); | |
935 | |||
// Pixels for offsets in lanes 0 and 1.
936 | 4792 | a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); | |
937 | 4792 | b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); | |
938 | 4792 | c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); | |
939 | 4792 | d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); | |
940 | |||
941 | // Convert bitsets such as in_range to 64bits, making all 1s or all 0s | ||
// Sign extension turns a 32-bit all-ones lane into a 64-bit all-ones lane,
// matching the 64-bit footprint of one 4-channel uint16_t pixel.
942 | 23960 | auto low32_to_u64 = [](uint32x4_t bitset) { | |
943 | 19168 | return vreinterpretq_u64_s64( | |
944 | 19168 | vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset)))); | |
945 | }; | ||
946 | |||
947 | 9584 | a_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo, | |
948 | 4792 | v_border); | |
949 | 9584 | b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo, | |
950 | 4792 | v_border); | |
951 | 9584 | c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo, | |
952 | 4792 | v_border); | |
953 | 9584 | d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo, | |
954 | 4792 | v_border); | |
955 | |||
// Pixels for offsets in lanes 2 and 3.
956 | 4792 | a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); | |
957 | 4792 | b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); | |
958 | 4792 | c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); | |
959 | 4792 | d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); | |
960 | |||
961 | // Convert bitsets such as in_range to 64bits, making all 1s or all 0s | ||
// Same widening as low32_to_u64, applied to mask lanes 2 and 3.
962 | 23960 | auto hi32_to_u64 = [](uint32x4_t bitset) { | |
963 | 19168 | return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset))); | |
964 | }; | ||
965 | |||
966 | 9584 | a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi, | |
967 | 4792 | v_border); | |
968 | 9584 | b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi, | |
969 | 4792 | v_border); | |
970 | 9584 | c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi, | |
971 | 4792 | v_border); | |
972 | 9584 | d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi, | |
973 | 4792 | v_border); | |
974 | 4792 | } | |
975 | |||
976 | // Convert bitsets such as in_range to 32bits, making all 1s or all 0s | ||
// Widens lanes 0-3 of a 16-bit mask to 32 bits by sign extension, so a 0xFFFF
// lane becomes 0xFFFFFFFF and a 0 lane stays 0.
// NOTE(review): despite the "_s32" suffix the return type is uint32x4_t.
977 | 19408 | static uint32x4_t low16_to_s32(uint16x8_t bitset) { | |
978 | 19408 | return vreinterpretq_u32_s32( | |
979 | 19408 | vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset)))); | |
980 | } | ||
981 | |||
// Widens lanes 4-7 of a 16-bit mask to 32 bits by sign extension, so a 0xFFFF
// lane becomes 0xFFFFFFFF and a 0 lane stays 0.
// NOTE(review): despite the "_s32" suffix the return type is uint32x4_t.
982 | 19408 | static uint32x4_t hi16_to_s32(uint16x8_t bitset) { | |
983 | 19408 | return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset))); | |
984 | } | ||
985 | |||
// Primary template: declared only. The row operations are provided by the
// explicit specializations for uint8_t and uint16_t.
986 | template <typename ScalarType> | ||
987 | class RemapS16Point5Constant4ch; | ||
988 | |||
// Row operation for 4-channel uint8_t remapping with constant border
// handling. Neighbours that fall outside the source image are replaced by the
// border pixel before interpolation.
989 | template <> | ||
990 | class RemapS16Point5Constant4ch<uint8_t> { | ||
991 | public: | ||
992 | using ScalarType = uint8_t; | ||
993 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
994 | |||
// Broadcasts stride, width and height (each narrowed to 16 bits) and builds
// the border vector: sizeof(uint32_t) bytes are copied from border_value and
// repeated in every 32-bit lane, i.e. one 4-channel pixel per lane.
// border_value must therefore point to at least four readable uint8_t values.
// NOTE(review): the narrowing casts assume stride and image size fit in
// 16 bits - confirm that the caller enforces this.
995 | 66 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
996 | size_t src_height, const ScalarType *border_value) | ||
997 | 66 | : src_rows_{src_rows}, | |
998 | 66 | v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))}, | |
999 | 66 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
1000 | 66 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
1001 | 66 | v_border_{} { | |
1002 | 66 | uint32_t border_value_32{}; | |
1003 | 66 | memcpy(&border_value_32, border_value, sizeof(uint32_t)); | |
1004 | 66 | v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32)); | |
1005 | 66 | } | |
1006 | |||
// Processes `width` destination pixels of one row.
1007 | 78 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
1008 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
// One call handles one vector of map entries (eight uint16 lanes per
// coordinate vector), then advances mapxy, mapfrac and dst by `step`.
1009 | 2534 | auto vector_path = [&](size_t step) { | |
1010 | 2456 | uint16x8_t x0, y0, x1, y1; | |
1011 | 2456 | uint16x8_t xfrac, yfrac; | |
1012 | 2456 | uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; | |
1013 | 2456 | get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, | |
1014 | y1, xfrac, yfrac, in_range_a, in_range_b, | ||
1015 | in_range_c, in_range_d); | ||
1016 | |||
1017 | 2456 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
1018 | 2456 | uint8x16_t a, b, c, d; | |
1019 | 2456 | uint8x16x2_t res; | |
1020 | |||
// First half: coordinates and masks in lanes 0-3.
1021 | 4912 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
1022 | 2456 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
1023 | 2456 | offsets_d, v_src_stride_); | |
1024 | |||
// The 16-bit masks are widened to one 32-bit lane per pixel.
1025 | 2456 | load_pixels_u8_4ch_constant( | |
1026 | 2456 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
1027 | 2456 | low16_to_s32(in_range_a), low16_to_s32(in_range_b), | |
1028 | 2456 | low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b, | |
1029 | c, d); | ||
1030 | |||
1031 | // Doubled fractions 001122..., low part | ||
1032 | 2456 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
1033 | 2456 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
// Complementary weights: REMAP16POINT5_FRAC_MAX - frac.
1034 | 4912 | uint16x8_t nxfrac2 = | |
1035 | 2456 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
1036 | 4912 | uint16x8_t nyfrac2 = | |
1037 | 2456 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
1038 | // Quadrupled fractions (00001111) are passed to interpolate | ||
// The low 8 bytes of a..d are widened to 16 bits for res0 and the high
// 8 bytes for res1; interpolate is defined elsewhere.
1039 | 4912 | uint16x8_t res0 = interpolate( | |
1040 | 2456 | vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)), | |
1041 | 2456 | vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
1042 | 2456 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
1043 | 4912 | uint16x8_t res1 = interpolate( | |
1044 | 2456 | vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
1045 | 2456 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2), | |
1046 | 2456 | vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2)); | |
// vuzp1q_u8 keeps the even-indexed bytes, i.e. the low byte of every
// 16-bit result, narrowing res0/res1 back to 8 bits per channel.
// NOTE(review): presumably interpolate already returns values that fit
// in 8 bits - confirm.
1047 | 2456 | res.val[0] = | |
1048 | 2456 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
1049 | |||
// Second half: coordinates and masks in lanes 4-7, same sequence.
1050 | 4912 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
1051 | 2456 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
1052 | 2456 | offsets_d, v_src_stride_); | |
1053 | |||
1054 | 2456 | load_pixels_u8_4ch_constant( | |
1055 | 2456 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
1056 | 2456 | hi16_to_s32(in_range_a), hi16_to_s32(in_range_b), | |
1057 | 2456 | hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c, | |
1058 | d); | ||
1059 | // Doubled fractions 001122..., high part | ||
1060 | 2456 | xfrac2 = vzip2q(xfrac, xfrac); | |
1061 | 2456 | yfrac2 = vzip2q(yfrac, yfrac); | |
1062 | 2456 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
1063 | 2456 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
1064 | // Quadrupled fractions (00001111) are passed to interpolate | ||
1065 | 4912 | res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), | |
1066 | 2456 | vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)), | |
1067 | 2456 | vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), | |
1068 | 2456 | vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); | |
1069 | 4912 | res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), | |
1070 | 2456 | vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), | |
1071 | 2456 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
1072 | 2456 | vzip2q(nyfrac2, nyfrac2)); | |
1073 | 2456 | res.val[1] = | |
1074 | 2456 | vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); | |
1075 | |||
1076 | 2456 | store_pixels_u8_4ch(res, dst); | |
1077 | 2456 | mapxy += ptrdiff_t(step); | |
1078 | 2456 | mapfrac += ptrdiff_t(step); | |
1079 | 2456 | dst += ptrdiff_t(step); | |
1080 | 2456 | }; | |
1081 | |||
// Full vectors go through unroll_once. The cursors are then moved back by
// (step - remaining_length) before the leftover is processed with the same
// full-width path.
// NOTE(review): presumably the last vector then ends exactly at the end of
// the row and overlaps already-written pixels; LoopUnroll is defined
// elsewhere - confirm.
1082 | 78 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
1083 | 78 | loop.unroll_once(vector_path); | |
1084 | 156 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
1085 | 78 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
1086 | 78 | mapxy -= back_step; | |
1087 | 78 | mapfrac -= back_step; | |
1088 | 78 | dst -= back_step; | |
1089 | 115 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
1090 | 78 | } | |
1091 | |||
1092 | private: | ||
// Source image rows.
1093 | Rows<const ScalarType> src_rows_; | ||
// Source stride as returned by Rows::stride(), broadcast to four lanes.
1094 | uint16x4_t v_src_stride_; | ||
// Source width and height, broadcast to eight lanes.
1095 | uint16x8_t v_width_; | ||
1096 | uint16x8_t v_height_; | ||
// Border pixel (4 x uint8_t) repeated four times.
1097 | uint8x16_t v_border_; | ||
1098 | }; // end of class RemapS16Point5Constant4ch<uint8_t> | ||
1099 | |||
// Row operation for 4-channel uint16_t remapping with constant border
// handling. Same structure as the uint8_t specialization, but every 128-bit
// vector holds only two pixels, so each group of four coordinates produces a
// low and a high vector and no widening or narrowing is needed.
1100 | template <> | ||
1101 | class RemapS16Point5Constant4ch<uint16_t> { | ||
1102 | public: | ||
1103 | using ScalarType = uint16_t; | ||
1104 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
1105 | |||
// The stride is divided by sizeof(ScalarType) before being broadcast, so it
// is expressed in uint16_t elements. The border vector is built by copying
// sizeof(uint64_t) bytes from border_value and repeating them in both 64-bit
// lanes, i.e. one 4-channel pixel per lane. border_value must therefore point
// to at least four readable uint16_t values.
// NOTE(review): the narrowing casts assume stride and image size fit in
// 16 bits - confirm that the caller enforces this.
1106 | 66 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
1107 | size_t src_height, const ScalarType *border_value) | ||
1108 | 66 | : src_rows_{src_rows}, | |
1109 | 132 | v_src_element_stride_{vdup_n_u16( | |
1110 | 66 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
1111 | 66 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
1112 | 66 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
1113 | 66 | v_border_{} { | |
1114 | 66 | uint64_t border_value_64{}; | |
1115 | 66 | memcpy(&border_value_64, border_value, sizeof(uint64_t)); | |
1116 | 66 | v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64)); | |
1117 | 66 | } | |
1118 | |||
// Processes `width` destination pixels of one row.
1119 | 78 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
1120 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
// One call handles one vector of map entries and fills res.val[0..3],
// then advances mapxy, mapfrac and dst by `step`.
1121 | 2474 | auto vector_path = [&](size_t step) { | |
1122 | 2396 | uint16x8_t x0, y0, x1, y1; | |
1123 | 2396 | uint16x8_t xfrac, yfrac; | |
1124 | 2396 | uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; | |
1125 | 2396 | get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, | |
1126 | y1, xfrac, yfrac, in_range_a, in_range_b, | ||
1127 | in_range_c, in_range_d); | ||
1128 | |||
1129 | 2396 | uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; | |
1130 | 2396 | uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; | |
1131 | 2396 | uint16x8x4_t res; | |
1132 | |||
// First half: coordinates and masks in lanes 0-3 -> res.val[0], [1].
1133 | 4792 | get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), | |
1134 | 2396 | vget_low_u16(y1), offsets_a, offsets_b, offsets_c, | |
1135 | 2396 | offsets_d, v_src_element_stride_); | |
1136 | |||
// The 16-bit masks are widened to 32 bits here; the loader widens them
// again to one 64-bit lane per pixel.
1137 | 2396 | load_pixels_u16_4ch_constant( | |
1138 | 2396 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
1139 | 2396 | low16_to_s32(in_range_a), low16_to_s32(in_range_b), | |
1140 | 2396 | low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low, | |
1141 | a_high, b_low, b_high, c_low, c_high, d_low, d_high); | ||
1142 | |||
1143 | // Doubled fractions 001122..., low part | ||
1144 | 2396 | uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); | |
1145 | 2396 | uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); | |
// Complementary weights: REMAP16POINT5_FRAC_MAX - frac.
1146 | 4792 | uint16x8_t nxfrac2 = | |
1147 | 2396 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
1148 | 4792 | uint16x8_t nyfrac2 = | |
1149 | 2396 | vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
1150 | // Quadrupled fractions (00001111) are passed to interpolate | ||
1151 | 2396 | res.val[0] = | |
1152 | 4792 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
1153 | 2396 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
1154 | 2396 | vzip1q(nyfrac2, nyfrac2)); | |
1155 | 2396 | res.val[1] = | |
1156 | 4792 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
1157 | 2396 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
1158 | 2396 | vzip2q(nyfrac2, nyfrac2)); | |
1159 | |||
// Second half: coordinates and masks in lanes 4-7 -> res.val[2], [3].
1160 | 4792 | get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), | |
1161 | 2396 | vget_high_u16(y1), offsets_a, offsets_b, offsets_c, | |
1162 | 2396 | offsets_d, v_src_element_stride_); | |
1163 | |||
1164 | 2396 | load_pixels_u16_4ch_constant( | |
1165 | 2396 | src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, | |
1166 | 2396 | hi16_to_s32(in_range_a), hi16_to_s32(in_range_b), | |
1167 | 2396 | hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low, | |
1168 | a_high, b_low, b_high, c_low, c_high, d_low, d_high); | ||
1169 | |||
1170 | // Doubled fractions 001122..., high part | ||
1171 | 2396 | xfrac2 = vzip2q(xfrac, xfrac); | |
1172 | 2396 | yfrac2 = vzip2q(yfrac, yfrac); | |
1173 | 2396 | nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); | |
1174 | 2396 | nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); | |
1175 | // Quadrupled fractions (00001111) are passed to interpolate | ||
1176 | 2396 | res.val[2] = | |
1177 | 4792 | interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), | |
1178 | 2396 | vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), | |
1179 | 2396 | vzip1q(nyfrac2, nyfrac2)); | |
1180 | 2396 | res.val[3] = | |
1181 | 4792 | interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), | |
1182 | 2396 | vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), | |
1183 | 2396 | vzip2q(nyfrac2, nyfrac2)); | |
1184 | |||
1185 | 2396 | store_pixels_u16_4ch(res, dst); | |
1186 | 2396 | mapxy += ptrdiff_t(step); | |
1187 | 2396 | mapfrac += ptrdiff_t(step); | |
1188 | 2396 | dst += ptrdiff_t(step); | |
1189 | 2396 | }; | |
1190 | |||
// Full vectors go through unroll_once. The cursors are then moved back by
// (step - remaining_length) before the leftover is processed with the same
// full-width path.
// NOTE(review): presumably the last vector then ends exactly at the end of
// the row and overlaps already-written pixels; LoopUnroll is defined
// elsewhere - confirm.
1191 | 78 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
1192 | 78 | loop.unroll_once(vector_path); | |
1193 | 156 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
1194 | 78 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
1195 | 78 | mapxy -= back_step; | |
1196 | 78 | mapfrac -= back_step; | |
1197 | 78 | dst -= back_step; | |
1198 | 115 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
1199 | 78 | } | |
1200 | |||
1201 | private: | ||
// Source image rows.
1202 | Rows<const ScalarType> src_rows_; | ||
// Rows::stride() divided by sizeof(uint16_t), broadcast to four lanes.
1203 | uint16x4_t v_src_element_stride_; | ||
// Source width and height, broadcast to eight lanes.
1204 | uint16x8_t v_width_; | ||
1205 | uint16x8_t v_height_; | ||
// Border pixel (4 x uint16_t) repeated twice.
1206 | uint16x8_t v_border_; | ||
1207 | }; // end of class RemapS16Point5Constant4ch<uint16_t> | ||
1208 | |||
1209 | // Most of the complexity comes from parameter checking. | ||
1210 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
1211 | template <typename T> | ||
1212 | 570 | kleidicv_error_t remap_s16point5( | |
1213 | const T *src, size_t src_stride, size_t src_width, size_t src_height, | ||
1214 | T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
1215 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
1216 | const uint16_t *mapfrac, size_t mapfrac_stride, | ||
1217 | [[maybe_unused]] kleidicv_border_type_t border_type, | ||
1218 | [[maybe_unused]] const T *border_value) { | ||
1219 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 284 times.
|
570 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
1220 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 283 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 283 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 283 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 283 times.
|
568 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
1221 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 282 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 282 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 282 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 282 times.
|
566 | CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); |
1222 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 281 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 281 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 281 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 281 times.
|
564 | CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height); |
1223 |
12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 280 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 278 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 278 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 280 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 278 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 278 times.
|
562 | CHECK_IMAGE_SIZE(src_width, src_height); |
1224 |
12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 277 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 276 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 276 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 277 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 276 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 276 times.
|
556 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
1225 |
8/8✓ Branch 0 taken 134 times.
✓ Branch 1 taken 142 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 134 times.
✓ Branch 5 taken 142 times.
✓ Branch 6 taken 133 times.
✓ Branch 7 taken 1 times.
|
552 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { |
1226 | 2 | return KLEIDICV_ERROR_NULL_POINTER; | |
1227 | } | ||
1228 | |||
1229 |
8/8✓ Branch 0 taken 265 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 265 times.
✓ Branch 3 taken 10 times.
✓ Branch 4 taken 265 times.
✓ Branch 5 taken 10 times.
✓ Branch 6 taken 265 times.
✓ Branch 7 taken 10 times.
|
1100 | if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height, |
1230 | 550 | dst_width, border_type, channels)) { | |
1231 | 20 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1232 | } | ||
1233 | |||
1234 | 530 | Rows<const T> src_rows{src, src_stride, channels}; | |
1235 | 530 | Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2}; | |
1236 | 530 | Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1}; | |
1237 | 530 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
1238 | 530 | Rectangle rect{dst_width, dst_height}; | |
1239 |
4/4✓ Branch 0 taken 133 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 132 times.
|
530 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { |
1240 |
4/4✓ Branch 0 taken 66 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 66 times.
✓ Branch 3 taken 66 times.
|
264 | if (channels == 1) { |
1241 | 264 | RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height, | |
1242 | 132 | border_value}; | |
1243 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1244 | 132 | } else { | |
1245 | assert(channels == 4); | ||
1246 | 264 | RemapS16Point5Constant4ch<T> operation{src_rows, src_width, src_height, | |
1247 | 132 | border_value}; | |
1248 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1249 | 132 | } | |
1250 | 264 | } else { | |
1251 | assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); | ||
1252 |
4/4✓ Branch 0 taken 67 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 67 times.
✓ Branch 3 taken 66 times.
|
266 | if (channels == 1) { |
1253 | 134 | RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height}; | |
1254 | 134 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1255 | 134 | } else { | |
1256 | assert(channels == 4); | ||
1257 | 132 | RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height}; | |
1258 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
1259 | 132 | } | |
1260 | } | ||
1261 | 530 | return KLEIDICV_OK; | |
1262 | 570 | } | |
1263 | // NOLINTEND(readability-function-cognitive-complexity) | ||
1264 | |||
// Expands to an explicit instantiation definition of remap_s16point5<type>,
// decorated with KLEIDICV_TARGET_FN_ATTRS. The parameter list spelled out here
// is the one the instantiation is declared with. The expansion carries no
// trailing semicolon; each use site supplies its own.
1265 | #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ | |
1266 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \ | |
1267 | const type *src, size_t src_stride, size_t src_width, size_t src_height, \ | |
1268 | type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ | |
1269 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ | |
1270 | const uint16_t *mapfrac, size_t mapfrac_stride, \ | |
1271 | kleidicv_border_type_t border_type, const type *border_value) | |
1272 | |||
// Explicitly instantiate remap_s16point5 for uint8_t and uint16_t via
// KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5.
1273 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); | |
1274 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); | |
1275 | |||
1276 | } // namespace kleidicv::neon | ||
1277 |