KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/transform_neon.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 337 337 100.0%
Functions: 73 73 100.0%
Branches: 96 96 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/neon.h"
7 #include "kleidicv/types.h"
8 #include "transform_common.h"
9
10 namespace kleidicv::neon {
11
12 typedef struct {
13 float32x4_t x, y;
14 } FloatVectorPair;
15
16 template <typename ScalarType, bool IsLarge>
17 395012 float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride,
18 Rows<const ScalarType>& src_rows) {
19 if constexpr (IsLarge) {
20 2592 uint64x2_t indices_low =
21 2592 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
22 1296 vget_low_u32(v_src_stride));
23 2592 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
24 1296 vget_low_u32(v_src_stride));
25 2592 uint64_t acc =
26 2592 static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_low, 0)]) |
27 1296 (static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_low, 1)]) << 32);
28 1296 uint64x2_t rawsrc = vdupq_n_u64(acc);
29 2592 acc = static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_high, 0)]) |
30 1296 (static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_high, 1)])
31 1296 << 32);
32 1296 rawsrc = vsetq_lane_u64(acc, rawsrc, 1);
33 2592 return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
34 1296 } else {
35 393716 uint32x4_t indices = vmlaq_u32(x, y, v_src_stride);
36 787432 uint64_t acc =
37 787432 static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 0)]) |
38 393716 (static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 1)]) << 32);
39 393716 uint64x2_t rawsrc = vdupq_n_u64(acc);
40 787432 acc = static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 2)]) |
41 393716 (static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 3)]) << 32);
42 393716 rawsrc = vsetq_lane_u64(acc, rawsrc, 1);
43 787432 return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
44 393716 }
45 }
46
47 template <typename ScalarType, bool IsLarge>
48 32584 float32x4x2_t inline load_xy_2ch(uint32x4_t x, uint32x4_t y,
49 uint32x4_t v_src_stride,
50 Rows<const ScalarType>& src_rows) {
51 32584 const size_t kBytes = 2 * sizeof(ScalarType);
52 32584 ScalarType elements[4 * 2]; // 4 pixels, 2 channels
53 // Multiply x with the number of channels (2)
54 32584 x = vshlq_n_u32(x, 1);
55 if constexpr (IsLarge) {
56 576 uint64x2_t indices_low =
57 576 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
58 288 vget_low_u32(v_src_stride));
59 576 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
60 288 vget_low_u32(v_src_stride));
61 288 memcpy(&elements[0], &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes);
62 288 memcpy(&elements[2], &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes);
63 288 memcpy(&elements[4], &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes);
64 288 memcpy(&elements[6], &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes);
65 288 } else {
66 32296 uint32x4_t indices = vmlaq_u32(x, y, v_src_stride);
67 32296 memcpy(&elements[0], &src_rows[vgetq_lane_u32(indices, 0)], kBytes);
68 32296 memcpy(&elements[2], &src_rows[vgetq_lane_u32(indices, 1)], kBytes);
69 32296 memcpy(&elements[4], &src_rows[vgetq_lane_u32(indices, 2)], kBytes);
70 32296 memcpy(&elements[6], &src_rows[vgetq_lane_u32(indices, 3)], kBytes);
71 32296 }
72 32584 uint16x8_t pixels16{};
73 if constexpr (std::is_same<ScalarType, uint8_t>::value) {
74 16556 pixels16 = vmovl_u8(vld1_u8(elements));
75 } else if constexpr (std::is_same<ScalarType, uint16_t>::value) {
76 16028 pixels16 = vld1q_u16(elements);
77 }
78 float32x4x2_t result;
79 32584 result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16)));
80 32584 result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16));
81 return result;
82 32584 }
83
84 template <typename ScalarType, bool IsLarge>
85 202904 float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y,
86 uint32x4_t in_range,
87 ScalarType border_value,
88 uint32x4_t v_src_stride,
89 Rows<const ScalarType> src_rows) {
90 if constexpr (IsLarge) {
91 1584 uint64x2_t indices_low =
92 1584 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
93 792 vget_low_u32(v_src_stride));
94 1584 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
95 792 vget_low_u32(v_src_stride));
96
4/4
✓ Branch 0 taken 554 times.
✓ Branch 1 taken 142 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
792 uint64_t pixel0 = vgetq_lane_u32(in_range, 0)
97 578 ? src_rows[vgetq_lane_u64(indices_low, 0)]
98 214 : border_value;
99
4/4
✓ Branch 0 taken 529 times.
✓ Branch 1 taken 167 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
792 uint64_t pixel1 = vgetq_lane_u32(in_range, 1)
100 553 ? src_rows[vgetq_lane_u64(indices_low, 1)]
101 239 : border_value;
102
4/4
✓ Branch 0 taken 530 times.
✓ Branch 1 taken 166 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 82 times.
792 uint64_t pixel2 = vgetq_lane_u32(in_range, 2)
103 544 ? src_rows[vgetq_lane_u64(indices_high, 0)]
104 248 : border_value;
105
4/4
✓ Branch 0 taken 538 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
792 uint64_t pixel3 = vgetq_lane_u32(in_range, 3)
106 562 ? src_rows[vgetq_lane_u64(indices_high, 1)]
107 230 : border_value;
108 792 uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
109 792 rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
110 792 rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1);
111 1584 return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
112 792 } else {
113 202112 uint32x4_t indices = vmlaq_u32(x, y, v_src_stride);
114
4/4
✓ Branch 0 taken 94757 times.
✓ Branch 1 taken 91519 times.
✓ Branch 2 taken 15609 times.
✓ Branch 3 taken 227 times.
202112 uint64_t pixel0 = vgetq_lane_u32(in_range, 0)
115 110366 ? src_rows[vgetq_lane_u32(indices, 0)]
116 91746 : border_value;
117
4/4
✓ Branch 0 taken 94207 times.
✓ Branch 1 taken 92069 times.
✓ Branch 2 taken 15597 times.
✓ Branch 3 taken 239 times.
202112 uint64_t pixel1 = vgetq_lane_u32(in_range, 1)
118 109804 ? src_rows[vgetq_lane_u32(indices, 1)]
119 92308 : border_value;
120
4/4
✓ Branch 0 taken 93784 times.
✓ Branch 1 taken 92492 times.
✓ Branch 2 taken 15604 times.
✓ Branch 3 taken 232 times.
202112 uint64_t pixel2 = vgetq_lane_u32(in_range, 2)
121 109388 ? src_rows[vgetq_lane_u32(indices, 2)]
122 92724 : border_value;
123
4/4
✓ Branch 0 taken 93215 times.
✓ Branch 1 taken 93061 times.
✓ Branch 2 taken 15589 times.
✓ Branch 3 taken 247 times.
202112 uint64_t pixel3 = vgetq_lane_u32(in_range, 3)
124 108804 ? src_rows[vgetq_lane_u32(indices, 3)]
125 93308 : border_value;
126 202112 uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
127 202112 rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
128 202112 rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1);
129 404224 return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
130 202112 }
131 }
132
133 template <typename ScalarType, bool IsLarge>
134 32584 float32x4x2_t inline load_xy_or_border_2ch(uint32x4_t x, uint32x4_t y,
135 uint32x4_t in_range,
136 const ScalarType* border_values,
137 uint32x4_t v_src_stride,
138 Rows<const ScalarType> src_rows) {
139 32584 const size_t kBytes = 2 * sizeof(ScalarType);
140 32584 const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{};
141 32584 ScalarType elements[4 * 2]; // 4 pixels, 2 channels
142 // Multiply x with the number of channels
143 32584 x = vshlq_n_u32(x, 1);
144 if constexpr (IsLarge) {
145 576 uint64x2_t indices_low =
146 576 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
147 288 vget_low_u32(v_src_stride));
148 576 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
149 288 vget_low_u32(v_src_stride));
150
4/4
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 130 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
288 pixel0 = vgetq_lane_u32(in_range, 0)
151 86 ? &src_rows[vgetq_lane_u64(indices_low, 0)]
152 202 : border_values;
153
4/4
✓ Branch 0 taken 37 times.
✓ Branch 1 taken 155 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
288 pixel1 = vgetq_lane_u32(in_range, 1)
154 61 ? &src_rows[vgetq_lane_u64(indices_low, 1)]
155 227 : border_values;
156
4/4
✓ Branch 0 taken 38 times.
✓ Branch 1 taken 154 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 82 times.
288 pixel2 = vgetq_lane_u32(in_range, 2)
157 52 ? &src_rows[vgetq_lane_u64(indices_high, 0)]
158 236 : border_values;
159
4/4
✓ Branch 0 taken 46 times.
✓ Branch 1 taken 146 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 72 times.
288 pixel3 = vgetq_lane_u32(in_range, 3)
160 70 ? &src_rows[vgetq_lane_u64(indices_high, 1)]
161 218 : border_values;
162 288 } else {
163 32296 uint32x4_t indices = vmlaq_u32(x, y, v_src_stride);
164
4/4
✓ Branch 0 taken 15854 times.
✓ Branch 1 taken 510 times.
✓ Branch 2 taken 15680 times.
✓ Branch 3 taken 252 times.
32296 pixel0 = vgetq_lane_u32(in_range, 0) ? &src_rows[vgetq_lane_u32(indices, 0)]
165 762 : border_values;
166
4/4
✓ Branch 0 taken 15807 times.
✓ Branch 1 taken 557 times.
✓ Branch 2 taken 15668 times.
✓ Branch 3 taken 264 times.
32296 pixel1 = vgetq_lane_u32(in_range, 1) ? &src_rows[vgetq_lane_u32(indices, 1)]
167 821 : border_values;
168
4/4
✓ Branch 0 taken 15852 times.
✓ Branch 1 taken 512 times.
✓ Branch 2 taken 15675 times.
✓ Branch 3 taken 257 times.
32296 pixel2 = vgetq_lane_u32(in_range, 2) ? &src_rows[vgetq_lane_u32(indices, 2)]
169 769 : border_values;
170
4/4
✓ Branch 0 taken 15823 times.
✓ Branch 1 taken 541 times.
✓ Branch 2 taken 15653 times.
✓ Branch 3 taken 279 times.
32296 pixel3 = vgetq_lane_u32(in_range, 3) ? &src_rows[vgetq_lane_u32(indices, 3)]
171 820 : border_values;
172 32296 }
173 32584 memcpy(&elements[0], pixel0, kBytes);
174 32584 memcpy(&elements[2], pixel1, kBytes);
175 32584 memcpy(&elements[4], pixel2, kBytes);
176 32584 memcpy(&elements[6], pixel3, kBytes);
177 32584 uint16x8_t pixels16{};
178 if constexpr (std::is_same<ScalarType, uint8_t>::value) {
179 16556 pixels16 = vmovl_u8(vld1_u8(elements));
180 } else if constexpr (std::is_same<ScalarType, uint16_t>::value) {
181 16028 pixels16 = vld1q_u16(elements);
182 }
183 float32x4x2_t result;
184 32584 result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16)));
185 32584 result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16));
186 return result;
187 32584 }
188
189 template <typename ScalarType, bool IsLarge>
190 98753 void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax,
191 uint32x4_t v_ymax, uint32x4_t v_src_stride,
192 Rows<const ScalarType> src_rows,
193 float32x4_t& xfrac, float32x4_t& yfrac,
194 float32x4_t& a, float32x4_t& b, float32x4_t& c,
195 float32x4_t& d) {
196 98753 auto&& [xf, yf] = xy;
197 // Truncating convert to int
198 98753 uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax);
199 98753 uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax);
200
201 // Get fractional part, or 0 if out of range
202 98753 float32x4_t zero = vdupq_n_f32(0.F);
203 98753 uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax));
204 98753 uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax));
205 98753 xfrac = vsubq_f32(xf, vrndmq_f32(xf));
206 98753 yfrac = vsubq_f32(yf, vrndmq_f32(yf));
207
208 // x1 = x0 + 1, except if it's already xmax or out of range
209 98753 uint32x4_t x1 = vsubq_u32(x0, x_in_range);
210 98753 uint32x4_t y1 = vsubq_u32(y0, y_in_range);
211
212 // a: top left, b: top right, c: bottom left, d: bottom right
213 98753 a = load_xy<ScalarType, IsLarge>(x0, y0, v_src_stride, src_rows);
214 98753 b = load_xy<ScalarType, IsLarge>(x1, y0, v_src_stride, src_rows);
215 98753 c = load_xy<ScalarType, IsLarge>(x0, y1, v_src_stride, src_rows);
216 98753 d = load_xy<ScalarType, IsLarge>(x1, y1, v_src_stride, src_rows);
217 98753 }
218
219 template <typename ScalarType, bool IsLarge>
220 8146 void load_quad_pixels_replicate_2ch(FloatVectorPair xy, uint32x4_t v_xmax,
221 uint32x4_t v_ymax, uint32x4_t v_src_stride,
222 Rows<const ScalarType> src_rows,
223 float32x4_t& xfrac, float32x4_t& yfrac,
224 float32x4x2_t& a, float32x4x2_t& b,
225 float32x4x2_t& c, float32x4x2_t& d) {
226 8146 auto&& [xf, yf] = xy;
227 // Truncating convert to int
228 8146 uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax);
229 8146 uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax);
230
231 // Get fractional part, or 0 if out of range
232 8146 float32x4_t zero = vdupq_n_f32(0.F);
233 8146 uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax));
234 8146 uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax));
235 8146 xfrac = vsubq_f32(xf, vrndmq_f32(xf));
236 8146 yfrac = vsubq_f32(yf, vrndmq_f32(yf));
237
238 // x1 = x0 + 1, except if it's already xmax or out of range
239 8146 uint32x4_t x1 = vsubq_u32(x0, x_in_range);
240 8146 uint32x4_t y1 = vsubq_u32(y0, y_in_range);
241
242 // a: top left, b: top right, c: bottom left, d: bottom right
243 8146 a = load_xy_2ch<ScalarType, IsLarge>(x0, y0, v_src_stride, src_rows);
244 8146 b = load_xy_2ch<ScalarType, IsLarge>(x1, y0, v_src_stride, src_rows);
245 8146 c = load_xy_2ch<ScalarType, IsLarge>(x0, y1, v_src_stride, src_rows);
246 8146 d = load_xy_2ch<ScalarType, IsLarge>(x1, y1, v_src_stride, src_rows);
247 8146 }
248
249 template <typename ScalarType, bool IsLarge>
250 50726 void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax,
251 uint32x4_t v_ymax, uint32x4_t v_src_stride,
252 const ScalarType* border_values,
253 Rows<const ScalarType> src_rows,
254 float32x4_t& xfrac, float32x4_t& yfrac,
255 float32x4_t& a, float32x4_t& b, float32x4_t& c,
256 float32x4_t& d) {
257 50726 auto&& [xf, yf] = xy;
258 // Convert coordinates to integers, truncating towards minus infinity.
259 // Negative numbers will become large positive numbers.
260 // Since the source width and height is known to be <=2^24 these large
261 // positive numbers will always be treated as outside the source image
262 // bounds.
263 50726 uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf));
264 50726 uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf));
265 50726 uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1));
266 50726 uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1));
267 50726 xfrac = vsubq_f32(xf, vrndmq_f32(xf));
268 50726 yfrac = vsubq_f32(yf, vrndmq_f32(yf));
269 50726 uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range;
270 {
271 50726 uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax);
272 50726 uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax);
273 50726 uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax);
274 50726 uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax);
275 50726 a_in_range = vandq(x0_in_range, y0_in_range);
276 50726 b_in_range = vandq(x1_in_range, y0_in_range);
277 50726 c_in_range = vandq(x0_in_range, y1_in_range);
278 50726 d_in_range = vandq(x1_in_range, y1_in_range);
279 50726 }
280 50726 a = load_xy_or_border<ScalarType, IsLarge>(
281 50726 x0, y0, a_in_range, border_values[0], v_src_stride, src_rows);
282 50726 b = load_xy_or_border<ScalarType, IsLarge>(
283 50726 x1, y0, b_in_range, border_values[0], v_src_stride, src_rows);
284 50726 c = load_xy_or_border<ScalarType, IsLarge>(
285 50726 x0, y1, c_in_range, border_values[0], v_src_stride, src_rows);
286 50726 d = load_xy_or_border<ScalarType, IsLarge>(
287 50726 x1, y1, d_in_range, border_values[0], v_src_stride, src_rows);
288 50726 }
289
290 template <typename ScalarType, bool IsLarge>
291 8146 void load_quad_pixels_constant_2ch(FloatVectorPair xy, uint32x4_t v_xmax,
292 uint32x4_t v_ymax, uint32x4_t v_src_stride,
293 const ScalarType* border_values,
294 Rows<const ScalarType> src_rows,
295 float32x4_t& xfrac, float32x4_t& yfrac,
296 float32x4x2_t& a, float32x4x2_t& b,
297 float32x4x2_t& c, float32x4x2_t& d) {
298 8146 auto&& [xf, yf] = xy;
299 // Convert coordinates to integers, truncating towards minus infinity.
300 // Negative numbers will become large positive numbers.
301 // Since the source width and height is known to be <=2^24 these large
302 // positive numbers will always be treated as outside the source image
303 // bounds.
304 8146 uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf));
305 8146 uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf));
306 8146 uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1));
307 8146 uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1));
308 8146 xfrac = vsubq_f32(xf, vrndmq_f32(xf));
309 8146 yfrac = vsubq_f32(yf, vrndmq_f32(yf));
310 8146 uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range;
311 {
312 8146 uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax);
313 8146 uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax);
314 8146 uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax);
315 8146 uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax);
316 8146 a_in_range = vandq(x0_in_range, y0_in_range);
317 8146 b_in_range = vandq(x1_in_range, y0_in_range);
318 8146 c_in_range = vandq(x0_in_range, y1_in_range);
319 8146 d_in_range = vandq(x1_in_range, y1_in_range);
320 8146 }
321 16292 a = load_xy_or_border_2ch<ScalarType, IsLarge>(
322 8146 x0, y0, a_in_range, border_values, v_src_stride, src_rows);
323 16292 b = load_xy_or_border_2ch<ScalarType, IsLarge>(
324 8146 x1, y0, b_in_range, border_values, v_src_stride, src_rows);
325 16292 c = load_xy_or_border_2ch<ScalarType, IsLarge>(
326 8146 x0, y1, c_in_range, border_values, v_src_stride, src_rows);
327 16292 d = load_xy_or_border_2ch<ScalarType, IsLarge>(
328 8146 x1, y1, d_in_range, border_values, v_src_stride, src_rows);
329 8146 }
330
331 182063 inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a,
332 float32x4_t b, float32x4_t c, float32x4_t d) {
333 182063 float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
334 182063 float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
335 182063 float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
336 364126 return vcvtaq_u32_f32(result);
337 182063 }
338
339 template <typename ScalarType, bool IsLarge, size_t Channels>
340 83168 void transform_pixels_replicate(float32x4_t xf, float32x4_t yf,
341 uint32x4_t v_xmax, uint32x4_t v_ymax,
342 uint32x4_t v_src_element_stride,
343 Rows<const ScalarType> src_rows,
344 Columns<ScalarType> dst) {
345 // Round to nearest, with Ties To Away (i.e. round 0.5 up)
346 // Clamp coordinates to within the dimensions of the source image
347 // (vcvtaq already converted negative values to 0)
348 83168 uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax);
349 83168 uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax);
350 if constexpr (Channels == 2) {
351 // Multiply x with the number of channels
352 394 x = vshlq_n_u32(x, 1);
353 }
354 // Copy pixels from source
355 if constexpr (IsLarge) {
356 792 uint64x2_t indices_low =
357 792 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
358 396 vget_low_u32(v_src_element_stride));
359 792 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
360 396 vget_low_u32(v_src_element_stride));
361 if constexpr (Channels == 1) {
362 324 dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)];
363 324 dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)];
364 324 dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)];
365 324 dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)];
366 } else {
367 72 const size_t kBytes = Channels * sizeof(ScalarType);
368 72 memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes);
369 72 memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes);
370 72 memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes);
371 72 memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes);
372 72 }
373 396 } else {
374 82772 uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride);
375 if constexpr (Channels == 1) {
376 82450 dst[0] = src_rows[vgetq_lane_u32(indices, 0)];
377 82450 dst[1] = src_rows[vgetq_lane_u32(indices, 1)];
378 82450 dst[2] = src_rows[vgetq_lane_u32(indices, 2)];
379 82450 dst[3] = src_rows[vgetq_lane_u32(indices, 3)];
380 } else {
381 322 const size_t kBytes = Channels * sizeof(ScalarType);
382 322 memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u32(indices, 0)], kBytes);
383 322 memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u32(indices, 1)], kBytes);
384 322 memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u32(indices, 2)], kBytes);
385 322 memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u32(indices, 3)], kBytes);
386 322 }
387 82772 }
388 83168 }
389
390 template <size_t Lane, typename ScalarType>
391 155988 static const ScalarType* get_src_or_border_small(
392 uint32x4_t in_range, Rows<const ScalarType> src_rows, uint32x4_t indices,
393 const ScalarType* border_values) {
394
16/16
✓ Branch 0 taken 32786 times.
✓ Branch 1 taken 6021 times.
✓ Branch 2 taken 32763 times.
✓ Branch 3 taken 6044 times.
✓ Branch 4 taken 32748 times.
✓ Branch 5 taken 6059 times.
✓ Branch 6 taken 32710 times.
✓ Branch 7 taken 6097 times.
✓ Branch 8 taken 90 times.
✓ Branch 9 taken 100 times.
✓ Branch 10 taken 86 times.
✓ Branch 11 taken 104 times.
✓ Branch 12 taken 90 times.
✓ Branch 13 taken 100 times.
✓ Branch 14 taken 88 times.
✓ Branch 15 taken 102 times.
155988 return vgetq_lane_u32(in_range, Lane)
395 131361 ? &src_rows[vgetq_lane_u32(indices, Lane)]
396 24627 : border_values;
397 }
398
399 template <size_t Lane, typename ScalarType>
400 1080 static const ScalarType* get_src_or_border_large(
401 uint32x4_t in_range, Rows<const ScalarType> src_rows, uint64x2_t indices,
402 const ScalarType* border_values) {
403
16/16
✓ Branch 0 taken 64 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 54 times.
✓ Branch 3 taken 168 times.
✓ Branch 4 taken 40 times.
✓ Branch 5 taken 182 times.
✓ Branch 6 taken 54 times.
✓ Branch 7 taken 168 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 34 times.
✓ Branch 10 taken 18 times.
✓ Branch 11 taken 30 times.
✓ Branch 12 taken 10 times.
✓ Branch 13 taken 38 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 36 times.
1080 return vgetq_lane_u32(in_range, Lane)
404 266 ? &src_rows[vgetq_lane_u64(indices, Lane % 2)]
405 814 : border_values;
406 }
407
408 template <typename ScalarType, bool IsLarge, size_t Channels>
409 39267 void transform_pixels_constant(float32x4_t xf, float32x4_t yf,
410 uint32x4_t v_xmax, uint32x4_t v_ymax,
411 uint32x4_t v_src_element_stride,
412 Rows<const ScalarType> src_rows,
413 Columns<ScalarType> dst,
414 const ScalarType* border_values) {
415 // Convert coordinates to integers.
416 // Negative numbers will become large positive numbers.
417 // Since the source width and height is known to be <=2^24 these large
418 // positive numbers will always be treated as outside the source image
419 // bounds.
420 39267 uint32x4_t x = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf));
421 39267 uint32x4_t y = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf));
422 39267 uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax));
423
424 // Copy pixels from source
425 if constexpr (Channels == 1) {
426 if constexpr (IsLarge) {
427 396 uint64x2_t indices_low =
428 396 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
429 198 vget_low_u32(v_src_element_stride));
430 396 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
431 198 vget_low_u32(v_src_element_stride));
432 396 dst[0] = *get_src_or_border_large<0>(in_range, src_rows, indices_low,
433 198 border_values);
434 396 dst[1] = *get_src_or_border_large<1>(in_range, src_rows, indices_low,
435 198 border_values);
436 396 dst[2] = *get_src_or_border_large<2>(in_range, src_rows, indices_high,
437 198 border_values);
438 396 dst[3] = *get_src_or_border_large<3>(in_range, src_rows, indices_high,
439 198 border_values);
440 198 } else {
441 38675 uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride);
442 77350 dst[0] = *get_src_or_border_small<0>(in_range, src_rows, indices,
443 38675 border_values);
444 77350 dst[1] = *get_src_or_border_small<1>(in_range, src_rows, indices,
445 38675 border_values);
446 77350 dst[2] = *get_src_or_border_small<2>(in_range, src_rows, indices,
447 38675 border_values);
448 77350 dst[3] = *get_src_or_border_small<3>(in_range, src_rows, indices,
449 38675 border_values);
450 38675 }
451 } else { // Channels > 1
452 394 const size_t kBytes = Channels * sizeof(ScalarType);
453 394 const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{};
454 // Multiply x with the number of channels
455 if constexpr (Channels == 2) {
456 394 x = vshlq_n_u32(x, 1);
457 } else {
458 x = vmulq_n_u32(x, Channels);
459 }
460 if constexpr (IsLarge) {
461 144 uint64x2_t indices_low =
462 144 vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
463 72 vget_low_u32(v_src_element_stride));
464 144 uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
465 72 vget_low_u32(v_src_element_stride));
466 144 pixel0 = get_src_or_border_large<0>(in_range, src_rows, indices_low,
467 72 border_values);
468 144 pixel1 = get_src_or_border_large<1>(in_range, src_rows, indices_low,
469 72 border_values);
470 144 pixel2 = get_src_or_border_large<2>(in_range, src_rows, indices_high,
471 72 border_values);
472 144 pixel3 = get_src_or_border_large<3>(in_range, src_rows, indices_high,
473 72 border_values);
474
475 72 } else {
476 322 uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride);
477 644 pixel0 = get_src_or_border_small<0>(in_range, src_rows, indices,
478 322 border_values);
479 644 pixel1 = get_src_or_border_small<1>(in_range, src_rows, indices,
480 322 border_values);
481 644 pixel2 = get_src_or_border_small<2>(in_range, src_rows, indices,
482 322 border_values);
483 644 pixel3 = get_src_or_border_small<3>(in_range, src_rows, indices,
484 322 border_values);
485 322 }
486 394 memcpy(dst.ptr_at(0), pixel0, kBytes);
487 394 memcpy(dst.ptr_at(1), pixel1, kBytes);
488 394 memcpy(dst.ptr_at(2), pixel2, kBytes);
489 394 memcpy(dst.ptr_at(3), pixel3, kBytes);
490 394 }
491 39267 }
492
493 template <typename ScalarType, bool IsLarge, size_t Channels,
494 kleidicv_border_type_t Border>
495 1432 void transform_pixels(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax,
496 uint32x4_t v_ymax, uint32x4_t v_src_element_stride,
497 Rows<const ScalarType> src_rows, Columns<ScalarType> dst,
498 [[maybe_unused]] const ScalarType* border_values) {
499 if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
500 716 transform_pixels_replicate<ScalarType, IsLarge, Channels>(
501 716 xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst);
502 } else {
503 static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
504 716 transform_pixels_constant<ScalarType, IsLarge, Channels>(
505 716 xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst,
506 716 border_values);
507 }
508 1432 }
509
510 } // namespace kleidicv::neon
511