Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv/ctypes.h" | ||
6 | #include "kleidicv/neon.h" | ||
7 | #include "kleidicv/types.h" | ||
8 | #include "transform_common.h" | ||
9 | |||
10 | namespace kleidicv::neon { | ||
11 | |||
12 | typedef struct { | ||
13 | float32x4_t x, y; | ||
14 | } FloatVectorPair; | ||
15 | |||
16 | template <typename ScalarType, bool IsLarge> | ||
17 | 395012 | float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, | |
18 | Rows<const ScalarType>& src_rows) { | ||
19 | if constexpr (IsLarge) { | ||
20 | 2592 | uint64x2_t indices_low = | |
21 | 2592 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
22 | 1296 | vget_low_u32(v_src_stride)); | |
23 | 2592 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
24 | 1296 | vget_low_u32(v_src_stride)); | |
25 | 2592 | uint64_t acc = | |
26 | 2592 | static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_low, 0)]) | | |
27 | 1296 | (static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_low, 1)]) << 32); | |
28 | 1296 | uint64x2_t rawsrc = vdupq_n_u64(acc); | |
29 | 2592 | acc = static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_high, 0)]) | | |
30 | 1296 | (static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_high, 1)]) | |
31 | 1296 | << 32); | |
32 | 1296 | rawsrc = vsetq_lane_u64(acc, rawsrc, 1); | |
33 | 2592 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
34 | 1296 | } else { | |
35 | 393716 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
36 | 787432 | uint64_t acc = | |
37 | 787432 | static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 0)]) | | |
38 | 393716 | (static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 1)]) << 32); | |
39 | 393716 | uint64x2_t rawsrc = vdupq_n_u64(acc); | |
40 | 787432 | acc = static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 2)]) | | |
41 | 393716 | (static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 3)]) << 32); | |
42 | 393716 | rawsrc = vsetq_lane_u64(acc, rawsrc, 1); | |
43 | 787432 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
44 | 393716 | } | |
45 | } | ||
46 | |||
47 | template <typename ScalarType, bool IsLarge> | ||
48 | 32584 | float32x4x2_t inline load_xy_2ch(uint32x4_t x, uint32x4_t y, | |
49 | uint32x4_t v_src_stride, | ||
50 | Rows<const ScalarType>& src_rows) { | ||
51 | 32584 | const size_t kBytes = 2 * sizeof(ScalarType); | |
52 | 32584 | ScalarType elements[4 * 2]; // 4 pixels, 2 channels | |
53 | // Multiply x with the number of channels (2) | ||
54 | 32584 | x = vshlq_n_u32(x, 1); | |
55 | if constexpr (IsLarge) { | ||
56 | 576 | uint64x2_t indices_low = | |
57 | 576 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
58 | 288 | vget_low_u32(v_src_stride)); | |
59 | 576 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
60 | 288 | vget_low_u32(v_src_stride)); | |
61 | 288 | memcpy(&elements[0], &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); | |
62 | 288 | memcpy(&elements[2], &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); | |
63 | 288 | memcpy(&elements[4], &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); | |
64 | 288 | memcpy(&elements[6], &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); | |
65 | 288 | } else { | |
66 | 32296 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
67 | 32296 | memcpy(&elements[0], &src_rows[vgetq_lane_u32(indices, 0)], kBytes); | |
68 | 32296 | memcpy(&elements[2], &src_rows[vgetq_lane_u32(indices, 1)], kBytes); | |
69 | 32296 | memcpy(&elements[4], &src_rows[vgetq_lane_u32(indices, 2)], kBytes); | |
70 | 32296 | memcpy(&elements[6], &src_rows[vgetq_lane_u32(indices, 3)], kBytes); | |
71 | 32296 | } | |
72 | 32584 | uint16x8_t pixels16{}; | |
73 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
74 | 16556 | pixels16 = vmovl_u8(vld1_u8(elements)); | |
75 | } else if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
76 | 16028 | pixels16 = vld1q_u16(elements); | |
77 | } | ||
78 | float32x4x2_t result; | ||
79 | 32584 | result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); | |
80 | 32584 | result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); | |
81 | return result; | ||
82 | 32584 | } | |
83 | |||
84 | template <typename ScalarType, bool IsLarge> | ||
85 | 202904 | float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, | |
86 | uint32x4_t in_range, | ||
87 | ScalarType border_value, | ||
88 | uint32x4_t v_src_stride, | ||
89 | Rows<const ScalarType> src_rows) { | ||
90 | if constexpr (IsLarge) { | ||
91 | 1584 | uint64x2_t indices_low = | |
92 | 1584 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
93 | 792 | vget_low_u32(v_src_stride)); | |
94 | 1584 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
95 | 792 | vget_low_u32(v_src_stride)); | |
96 |
4/4✓ Branch 0 taken 554 times.
✓ Branch 1 taken 142 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
|
792 | uint64_t pixel0 = vgetq_lane_u32(in_range, 0) |
97 | 578 | ? src_rows[vgetq_lane_u64(indices_low, 0)] | |
98 | 214 | : border_value; | |
99 |
4/4✓ Branch 0 taken 529 times.
✓ Branch 1 taken 167 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
|
792 | uint64_t pixel1 = vgetq_lane_u32(in_range, 1) |
100 | 553 | ? src_rows[vgetq_lane_u64(indices_low, 1)] | |
101 | 239 | : border_value; | |
102 |
4/4✓ Branch 0 taken 530 times.
✓ Branch 1 taken 166 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 82 times.
|
792 | uint64_t pixel2 = vgetq_lane_u32(in_range, 2) |
103 | 544 | ? src_rows[vgetq_lane_u64(indices_high, 0)] | |
104 | 248 | : border_value; | |
105 |
4/4✓ Branch 0 taken 538 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
|
792 | uint64_t pixel3 = vgetq_lane_u32(in_range, 3) |
106 | 562 | ? src_rows[vgetq_lane_u64(indices_high, 1)] | |
107 | 230 | : border_value; | |
108 | 792 | uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
109 | 792 | rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
110 | 792 | rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1); | |
111 | 1584 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
112 | 792 | } else { | |
113 | 202112 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
114 |
4/4✓ Branch 0 taken 94757 times.
✓ Branch 1 taken 91519 times.
✓ Branch 2 taken 15609 times.
✓ Branch 3 taken 227 times.
|
202112 | uint64_t pixel0 = vgetq_lane_u32(in_range, 0) |
115 | 110366 | ? src_rows[vgetq_lane_u32(indices, 0)] | |
116 | 91746 | : border_value; | |
117 |
4/4✓ Branch 0 taken 94207 times.
✓ Branch 1 taken 92069 times.
✓ Branch 2 taken 15597 times.
✓ Branch 3 taken 239 times.
|
202112 | uint64_t pixel1 = vgetq_lane_u32(in_range, 1) |
118 | 109804 | ? src_rows[vgetq_lane_u32(indices, 1)] | |
119 | 92308 | : border_value; | |
120 |
4/4✓ Branch 0 taken 93784 times.
✓ Branch 1 taken 92492 times.
✓ Branch 2 taken 15604 times.
✓ Branch 3 taken 232 times.
|
202112 | uint64_t pixel2 = vgetq_lane_u32(in_range, 2) |
121 | 109388 | ? src_rows[vgetq_lane_u32(indices, 2)] | |
122 | 92724 | : border_value; | |
123 |
4/4✓ Branch 0 taken 93215 times.
✓ Branch 1 taken 93061 times.
✓ Branch 2 taken 15589 times.
✓ Branch 3 taken 247 times.
|
202112 | uint64_t pixel3 = vgetq_lane_u32(in_range, 3) |
124 | 108804 | ? src_rows[vgetq_lane_u32(indices, 3)] | |
125 | 93308 | : border_value; | |
126 | 202112 | uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
127 | 202112 | rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
128 | 202112 | rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1); | |
129 | 404224 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
130 | 202112 | } | |
131 | } | ||
132 | |||
133 | template <typename ScalarType, bool IsLarge> | ||
134 | 32584 | float32x4x2_t inline load_xy_or_border_2ch(uint32x4_t x, uint32x4_t y, | |
135 | uint32x4_t in_range, | ||
136 | const ScalarType* border_values, | ||
137 | uint32x4_t v_src_stride, | ||
138 | Rows<const ScalarType> src_rows) { | ||
139 | 32584 | const size_t kBytes = 2 * sizeof(ScalarType); | |
140 | 32584 | const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; | |
141 | 32584 | ScalarType elements[4 * 2]; // 4 pixels, 2 channels | |
142 | // Multiply x with the number of channels | ||
143 | 32584 | x = vshlq_n_u32(x, 1); | |
144 | if constexpr (IsLarge) { | ||
145 | 576 | uint64x2_t indices_low = | |
146 | 576 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
147 | 288 | vget_low_u32(v_src_stride)); | |
148 | 576 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
149 | 288 | vget_low_u32(v_src_stride)); | |
150 |
4/4✓ Branch 0 taken 62 times.
✓ Branch 1 taken 130 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
|
288 | pixel0 = vgetq_lane_u32(in_range, 0) |
151 | 86 | ? &src_rows[vgetq_lane_u64(indices_low, 0)] | |
152 | 202 | : border_values; | |
153 |
4/4✓ Branch 0 taken 37 times.
✓ Branch 1 taken 155 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
|
288 | pixel1 = vgetq_lane_u32(in_range, 1) |
154 | 61 | ? &src_rows[vgetq_lane_u64(indices_low, 1)] | |
155 | 227 | : border_values; | |
156 |
4/4✓ Branch 0 taken 38 times.
✓ Branch 1 taken 154 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 82 times.
|
288 | pixel2 = vgetq_lane_u32(in_range, 2) |
157 | 52 | ? &src_rows[vgetq_lane_u64(indices_high, 0)] | |
158 | 236 | : border_values; | |
159 |
4/4✓ Branch 0 taken 46 times.
✓ Branch 1 taken 146 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
|
288 | pixel3 = vgetq_lane_u32(in_range, 3) |
160 | 70 | ? &src_rows[vgetq_lane_u64(indices_high, 1)] | |
161 | 218 | : border_values; | |
162 | 288 | } else { | |
163 | 32296 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
164 |
4/4✓ Branch 0 taken 15854 times.
✓ Branch 1 taken 510 times.
✓ Branch 2 taken 15680 times.
✓ Branch 3 taken 252 times.
|
32296 | pixel0 = vgetq_lane_u32(in_range, 0) ? &src_rows[vgetq_lane_u32(indices, 0)] |
165 | 762 | : border_values; | |
166 |
4/4✓ Branch 0 taken 15807 times.
✓ Branch 1 taken 557 times.
✓ Branch 2 taken 15668 times.
✓ Branch 3 taken 264 times.
|
32296 | pixel1 = vgetq_lane_u32(in_range, 1) ? &src_rows[vgetq_lane_u32(indices, 1)] |
167 | 821 | : border_values; | |
168 |
4/4✓ Branch 0 taken 15852 times.
✓ Branch 1 taken 512 times.
✓ Branch 2 taken 15675 times.
✓ Branch 3 taken 257 times.
|
32296 | pixel2 = vgetq_lane_u32(in_range, 2) ? &src_rows[vgetq_lane_u32(indices, 2)] |
169 | 769 | : border_values; | |
170 |
4/4✓ Branch 0 taken 15823 times.
✓ Branch 1 taken 541 times.
✓ Branch 2 taken 15653 times.
✓ Branch 3 taken 279 times.
|
32296 | pixel3 = vgetq_lane_u32(in_range, 3) ? &src_rows[vgetq_lane_u32(indices, 3)] |
171 | 820 | : border_values; | |
172 | 32296 | } | |
173 | 32584 | memcpy(&elements[0], pixel0, kBytes); | |
174 | 32584 | memcpy(&elements[2], pixel1, kBytes); | |
175 | 32584 | memcpy(&elements[4], pixel2, kBytes); | |
176 | 32584 | memcpy(&elements[6], pixel3, kBytes); | |
177 | 32584 | uint16x8_t pixels16{}; | |
178 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
179 | 16556 | pixels16 = vmovl_u8(vld1_u8(elements)); | |
180 | } else if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
181 | 16028 | pixels16 = vld1q_u16(elements); | |
182 | } | ||
183 | float32x4x2_t result; | ||
184 | 32584 | result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); | |
185 | 32584 | result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); | |
186 | return result; | ||
187 | 32584 | } | |
188 | |||
189 | template <typename ScalarType, bool IsLarge> | ||
190 | 98753 | void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax, | |
191 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
192 | Rows<const ScalarType> src_rows, | ||
193 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
194 | float32x4_t& a, float32x4_t& b, float32x4_t& c, | ||
195 | float32x4_t& d) { | ||
196 | 98753 | auto&& [xf, yf] = xy; | |
197 | // Truncating convert to int | ||
198 | 98753 | uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); | |
199 | 98753 | uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); | |
200 | |||
201 | // Get fractional part, or 0 if out of range | ||
202 | 98753 | float32x4_t zero = vdupq_n_f32(0.F); | |
203 | 98753 | uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); | |
204 | 98753 | uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); | |
205 | 98753 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
206 | 98753 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
207 | |||
208 | // x1 = x0 + 1, except if it's already xmax or out of range | ||
209 | 98753 | uint32x4_t x1 = vsubq_u32(x0, x_in_range); | |
210 | 98753 | uint32x4_t y1 = vsubq_u32(y0, y_in_range); | |
211 | |||
212 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
213 | 98753 | a = load_xy<ScalarType, IsLarge>(x0, y0, v_src_stride, src_rows); | |
214 | 98753 | b = load_xy<ScalarType, IsLarge>(x1, y0, v_src_stride, src_rows); | |
215 | 98753 | c = load_xy<ScalarType, IsLarge>(x0, y1, v_src_stride, src_rows); | |
216 | 98753 | d = load_xy<ScalarType, IsLarge>(x1, y1, v_src_stride, src_rows); | |
217 | 98753 | } | |
218 | |||
219 | template <typename ScalarType, bool IsLarge> | ||
220 | 8146 | void load_quad_pixels_replicate_2ch(FloatVectorPair xy, uint32x4_t v_xmax, | |
221 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
222 | Rows<const ScalarType> src_rows, | ||
223 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
224 | float32x4x2_t& a, float32x4x2_t& b, | ||
225 | float32x4x2_t& c, float32x4x2_t& d) { | ||
226 | 8146 | auto&& [xf, yf] = xy; | |
227 | // Truncating convert to int | ||
228 | 8146 | uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); | |
229 | 8146 | uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); | |
230 | |||
231 | // Get fractional part, or 0 if out of range | ||
232 | 8146 | float32x4_t zero = vdupq_n_f32(0.F); | |
233 | 8146 | uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); | |
234 | 8146 | uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); | |
235 | 8146 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
236 | 8146 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
237 | |||
238 | // x1 = x0 + 1, except if it's already xmax or out of range | ||
239 | 8146 | uint32x4_t x1 = vsubq_u32(x0, x_in_range); | |
240 | 8146 | uint32x4_t y1 = vsubq_u32(y0, y_in_range); | |
241 | |||
242 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
243 | 8146 | a = load_xy_2ch<ScalarType, IsLarge>(x0, y0, v_src_stride, src_rows); | |
244 | 8146 | b = load_xy_2ch<ScalarType, IsLarge>(x1, y0, v_src_stride, src_rows); | |
245 | 8146 | c = load_xy_2ch<ScalarType, IsLarge>(x0, y1, v_src_stride, src_rows); | |
246 | 8146 | d = load_xy_2ch<ScalarType, IsLarge>(x1, y1, v_src_stride, src_rows); | |
247 | 8146 | } | |
248 | |||
249 | template <typename ScalarType, bool IsLarge> | ||
250 | 50726 | void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax, | |
251 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
252 | const ScalarType* border_values, | ||
253 | Rows<const ScalarType> src_rows, | ||
254 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
255 | float32x4_t& a, float32x4_t& b, float32x4_t& c, | ||
256 | float32x4_t& d) { | ||
257 | 50726 | auto&& [xf, yf] = xy; | |
258 | // Convert coordinates to integers, truncating towards minus infinity. | ||
259 | // Negative numbers will become large positive numbers. | ||
260 | // Since the source width and height is known to be <=2^24 these large | ||
261 | // positive numbers will always be treated as outside the source image | ||
262 | // bounds. | ||
263 | 50726 | uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf)); | |
264 | 50726 | uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf)); | |
265 | 50726 | uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1)); | |
266 | 50726 | uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1)); | |
267 | 50726 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
268 | 50726 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
269 | 50726 | uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range; | |
270 | { | ||
271 | 50726 | uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax); | |
272 | 50726 | uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax); | |
273 | 50726 | uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax); | |
274 | 50726 | uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax); | |
275 | 50726 | a_in_range = vandq(x0_in_range, y0_in_range); | |
276 | 50726 | b_in_range = vandq(x1_in_range, y0_in_range); | |
277 | 50726 | c_in_range = vandq(x0_in_range, y1_in_range); | |
278 | 50726 | d_in_range = vandq(x1_in_range, y1_in_range); | |
279 | 50726 | } | |
280 | 50726 | a = load_xy_or_border<ScalarType, IsLarge>( | |
281 | 50726 | x0, y0, a_in_range, border_values[0], v_src_stride, src_rows); | |
282 | 50726 | b = load_xy_or_border<ScalarType, IsLarge>( | |
283 | 50726 | x1, y0, b_in_range, border_values[0], v_src_stride, src_rows); | |
284 | 50726 | c = load_xy_or_border<ScalarType, IsLarge>( | |
285 | 50726 | x0, y1, c_in_range, border_values[0], v_src_stride, src_rows); | |
286 | 50726 | d = load_xy_or_border<ScalarType, IsLarge>( | |
287 | 50726 | x1, y1, d_in_range, border_values[0], v_src_stride, src_rows); | |
288 | 50726 | } | |
289 | |||
290 | template <typename ScalarType, bool IsLarge> | ||
291 | 8146 | void load_quad_pixels_constant_2ch(FloatVectorPair xy, uint32x4_t v_xmax, | |
292 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
293 | const ScalarType* border_values, | ||
294 | Rows<const ScalarType> src_rows, | ||
295 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
296 | float32x4x2_t& a, float32x4x2_t& b, | ||
297 | float32x4x2_t& c, float32x4x2_t& d) { | ||
298 | 8146 | auto&& [xf, yf] = xy; | |
299 | // Convert coordinates to integers, truncating towards minus infinity. | ||
300 | // Negative numbers will become large positive numbers. | ||
301 | // Since the source width and height is known to be <=2^24 these large | ||
302 | // positive numbers will always be treated as outside the source image | ||
303 | // bounds. | ||
304 | 8146 | uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf)); | |
305 | 8146 | uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf)); | |
306 | 8146 | uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1)); | |
307 | 8146 | uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1)); | |
308 | 8146 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
309 | 8146 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
310 | 8146 | uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range; | |
311 | { | ||
312 | 8146 | uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax); | |
313 | 8146 | uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax); | |
314 | 8146 | uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax); | |
315 | 8146 | uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax); | |
316 | 8146 | a_in_range = vandq(x0_in_range, y0_in_range); | |
317 | 8146 | b_in_range = vandq(x1_in_range, y0_in_range); | |
318 | 8146 | c_in_range = vandq(x0_in_range, y1_in_range); | |
319 | 8146 | d_in_range = vandq(x1_in_range, y1_in_range); | |
320 | 8146 | } | |
321 | 16292 | a = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
322 | 8146 | x0, y0, a_in_range, border_values, v_src_stride, src_rows); | |
323 | 16292 | b = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
324 | 8146 | x1, y0, b_in_range, border_values, v_src_stride, src_rows); | |
325 | 16292 | c = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
326 | 8146 | x0, y1, c_in_range, border_values, v_src_stride, src_rows); | |
327 | 16292 | d = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
328 | 8146 | x1, y1, d_in_range, border_values, v_src_stride, src_rows); | |
329 | 8146 | } | |
330 | |||
331 | 182063 | inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a, | |
332 | float32x4_t b, float32x4_t c, float32x4_t d) { | ||
333 | 182063 | float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); | |
334 | 182063 | float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); | |
335 | 182063 | float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); | |
336 | 364126 | return vcvtaq_u32_f32(result); | |
337 | 182063 | } | |
338 | |||
// Nearest-neighbour remap of four destination pixels with border
// replication: each (xf, yf) source coordinate is rounded to the nearest
// pixel, clamped to the image bounds, and the whole pixel (all Channels of
// it) is copied to dst[0..3].
//
// @param xf, yf               Source coordinates of the four output pixels.
// @param v_xmax, v_ymax       Maximum valid x / y coordinate (inclusive).
// @param v_src_element_stride Source row stride in elements (lane 0 used).
// @param src_rows             Source image view.
// @param dst                  Destination; four consecutive pixels written.
template <typename ScalarType, bool IsLarge, size_t Channels>
void transform_pixels_replicate(float32x4_t xf, float32x4_t yf,
                                uint32x4_t v_xmax, uint32x4_t v_ymax,
                                uint32x4_t v_src_element_stride,
                                Rows<const ScalarType> src_rows,
                                Columns<ScalarType> dst) {
  // Round to nearest, with Ties To Away (i.e. round 0.5 up)
  // Clamp coordinates to within the dimensions of the source image
  // (vcvtaq already converted negative values to 0)
  uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax);
  uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax);
  if constexpr (Channels == 2) {
    // Multiply x with the number of channels so x becomes an element offset.
    x = vshlq_n_u32(x, 1);
  }
  // Copy pixels from source
  if constexpr (IsLarge) {
    // offset = y * stride + x, widened to 64 bits per lane for big images.
    uint64x2_t indices_low =
        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
                  vget_low_u32(v_src_element_stride));
    uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
                                        vget_low_u32(v_src_element_stride));
    if constexpr (Channels == 1) {
      // Single channel: scalar element copies.
      dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)];
      dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)];
      dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)];
      dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)];
    } else {
      // Multi-channel: copy whole pixels with memcpy.
      const size_t kBytes = Channels * sizeof(ScalarType);
      memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes);
      memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes);
      memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes);
      memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes);
    }
  } else {
    // 32-bit offsets are sufficient for small images.
    uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride);
    if constexpr (Channels == 1) {
      dst[0] = src_rows[vgetq_lane_u32(indices, 0)];
      dst[1] = src_rows[vgetq_lane_u32(indices, 1)];
      dst[2] = src_rows[vgetq_lane_u32(indices, 2)];
      dst[3] = src_rows[vgetq_lane_u32(indices, 3)];
    } else {
      const size_t kBytes = Channels * sizeof(ScalarType);
      memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u32(indices, 0)], kBytes);
      memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u32(indices, 1)], kBytes);
      memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u32(indices, 2)], kBytes);
      memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u32(indices, 3)], kBytes);
    }
  }
}
389 | |||
390 | template <size_t Lane, typename ScalarType> | ||
391 | 155988 | static const ScalarType* get_src_or_border_small( | |
392 | uint32x4_t in_range, Rows<const ScalarType> src_rows, uint32x4_t indices, | ||
393 | const ScalarType* border_values) { | ||
394 |
16/16✓ Branch 0 taken 32786 times.
✓ Branch 1 taken 6021 times.
✓ Branch 2 taken 32763 times.
✓ Branch 3 taken 6044 times.
✓ Branch 4 taken 32748 times.
✓ Branch 5 taken 6059 times.
✓ Branch 6 taken 32710 times.
✓ Branch 7 taken 6097 times.
✓ Branch 8 taken 90 times.
✓ Branch 9 taken 100 times.
✓ Branch 10 taken 86 times.
✓ Branch 11 taken 104 times.
✓ Branch 12 taken 90 times.
✓ Branch 13 taken 100 times.
✓ Branch 14 taken 88 times.
✓ Branch 15 taken 102 times.
|
155988 | return vgetq_lane_u32(in_range, Lane) |
395 | 131361 | ? &src_rows[vgetq_lane_u32(indices, Lane)] | |
396 | 24627 | : border_values; | |
397 | } | ||
398 | |||
399 | template <size_t Lane, typename ScalarType> | ||
400 | 1080 | static const ScalarType* get_src_or_border_large( | |
401 | uint32x4_t in_range, Rows<const ScalarType> src_rows, uint64x2_t indices, | ||
402 | const ScalarType* border_values) { | ||
403 |
16/16✓ Branch 0 taken 64 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 54 times.
✓ Branch 3 taken 168 times.
✓ Branch 4 taken 40 times.
✓ Branch 5 taken 182 times.
✓ Branch 6 taken 54 times.
✓ Branch 7 taken 168 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 34 times.
✓ Branch 10 taken 18 times.
✓ Branch 11 taken 30 times.
✓ Branch 12 taken 10 times.
✓ Branch 13 taken 38 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 36 times.
|
1080 | return vgetq_lane_u32(in_range, Lane) |
404 | 266 | ? &src_rows[vgetq_lane_u64(indices, Lane % 2)] | |
405 | 814 | : border_values; | |
406 | } | ||
407 | |||
// Nearest-neighbour remap of four destination pixels with a constant
// border: each (xf, yf) source coordinate is rounded to the nearest pixel;
// in-bounds pixels are copied from the source, out-of-bounds pixels take
// the border value(s).
//
// @param xf, yf               Source coordinates of the four output pixels.
// @param v_xmax, v_ymax       Maximum valid x / y coordinate (inclusive).
// @param v_src_element_stride Source row stride in elements (lane 0 used).
// @param src_rows             Source image view.
// @param dst                  Destination; four consecutive pixels written.
// @param border_values        Pointer to one border pixel (Channels values).
template <typename ScalarType, bool IsLarge, size_t Channels>
void transform_pixels_constant(float32x4_t xf, float32x4_t yf,
                               uint32x4_t v_xmax, uint32x4_t v_ymax,
                               uint32x4_t v_src_element_stride,
                               Rows<const ScalarType> src_rows,
                               Columns<ScalarType> dst,
                               const ScalarType* border_values) {
  // Convert coordinates to integers.
  // Negative numbers will become large positive numbers.
  // Since the source width and height is known to be <=2^24 these large
  // positive numbers will always be treated as outside the source image
  // bounds.
  uint32x4_t x = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf));
  uint32x4_t y = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf));
  // All-ones per lane when both coordinates are inside the image.
  uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax));

  // Copy pixels from source
  if constexpr (Channels == 1) {
    if constexpr (IsLarge) {
      // offset = y * stride + x, widened to 64 bits per lane.
      uint64x2_t indices_low =
          vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
                    vget_low_u32(v_src_element_stride));
      uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
                                          vget_low_u32(v_src_element_stride));
      dst[0] = *get_src_or_border_large<0>(in_range, src_rows, indices_low,
                                           border_values);
      dst[1] = *get_src_or_border_large<1>(in_range, src_rows, indices_low,
                                           border_values);
      dst[2] = *get_src_or_border_large<2>(in_range, src_rows, indices_high,
                                           border_values);
      dst[3] = *get_src_or_border_large<3>(in_range, src_rows, indices_high,
                                           border_values);
    } else {
      // 32-bit offsets are sufficient for small images.
      uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride);
      dst[0] = *get_src_or_border_small<0>(in_range, src_rows, indices,
                                           border_values);
      dst[1] = *get_src_or_border_small<1>(in_range, src_rows, indices,
                                           border_values);
      dst[2] = *get_src_or_border_small<2>(in_range, src_rows, indices,
                                           border_values);
      dst[3] = *get_src_or_border_small<3>(in_range, src_rows, indices,
                                           border_values);
    }
  } else {  // Channels > 1
    const size_t kBytes = Channels * sizeof(ScalarType);
    const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{};
    // Multiply x with the number of channels
    if constexpr (Channels == 2) {
      // Shift is cheaper than a general multiply for the 2-channel case.
      x = vshlq_n_u32(x, 1);
    } else {
      x = vmulq_n_u32(x, Channels);
    }
    if constexpr (IsLarge) {
      uint64x2_t indices_low =
          vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
                    vget_low_u32(v_src_element_stride));
      uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
                                          vget_low_u32(v_src_element_stride));
      pixel0 = get_src_or_border_large<0>(in_range, src_rows, indices_low,
                                          border_values);
      pixel1 = get_src_or_border_large<1>(in_range, src_rows, indices_low,
                                          border_values);
      pixel2 = get_src_or_border_large<2>(in_range, src_rows, indices_high,
                                          border_values);
      pixel3 = get_src_or_border_large<3>(in_range, src_rows, indices_high,
                                          border_values);

    } else {
      uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride);
      pixel0 = get_src_or_border_small<0>(in_range, src_rows, indices,
                                          border_values);
      pixel1 = get_src_or_border_small<1>(in_range, src_rows, indices,
                                          border_values);
      pixel2 = get_src_or_border_small<2>(in_range, src_rows, indices,
                                          border_values);
      pixel3 = get_src_or_border_small<3>(in_range, src_rows, indices,
                                          border_values);
    }
    // Copy whole pixels (all channels) to the destination.
    memcpy(dst.ptr_at(0), pixel0, kBytes);
    memcpy(dst.ptr_at(1), pixel1, kBytes);
    memcpy(dst.ptr_at(2), pixel2, kBytes);
    memcpy(dst.ptr_at(3), pixel3, kBytes);
  }
}
492 | |||
493 | template <typename ScalarType, bool IsLarge, size_t Channels, | ||
494 | kleidicv_border_type_t Border> | ||
495 | 1432 | void transform_pixels(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, | |
496 | uint32x4_t v_ymax, uint32x4_t v_src_element_stride, | ||
497 | Rows<const ScalarType> src_rows, Columns<ScalarType> dst, | ||
498 | [[maybe_unused]] const ScalarType* border_values) { | ||
499 | if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { | ||
500 | 716 | transform_pixels_replicate<ScalarType, IsLarge, Channels>( | |
501 | 716 | xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst); | |
502 | } else { | ||
503 | static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); | ||
504 | 716 | transform_pixels_constant<ScalarType, IsLarge, Channels>( | |
505 | 716 | xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst, | |
506 | 716 | border_values); | |
507 | } | ||
508 | 1432 | } | |
509 | |||
510 | } // namespace kleidicv::neon | ||
511 |