KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_neon.cpp
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 655 656 99.8%
Functions: 38 38 100.0%
Branches: 88 94 93.6%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/neon.h"
8 #include "kleidicv/resize/resize_linear.h"
9
10 namespace kleidicv::neon {
11
12 template <uint8_t P, uint8_t Q, uint8_t Bias, uint8_t Shift>
13 9568 uint8x8_t lerp2d_vector_p_q_q_1(uint8x8_t a, uint8x8_t b, uint8x8_t c,
14 uint8x8_t d) {
15 // b + c
16 9568 uint16x8_t b_c = vaddl_u8(b, c);
17
18 // a * p
19 9568 uint16x8_t ap = vmull_u8(a, vdup_n_u8(P));
20
21 // a * p + (b + c) * q
22 9568 uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q));
23
24 // d + bias
25 9568 uint16x8_t d_bias = vaddl_u8(d, vdup_n_u8(Bias));
26
27 // a * p + (b + c) * q + d + bias
28 9568 uint16x8_t ap_bcq_d_bias = vaddq_u16(ap_bcq, d_bias);
29
30 // (a * p + (b + c) * q + d + bias) >> shift
31 9568 uint8x8_t result = vshrn_n_u16(ap_bcq_d_bias, Shift);
32 19136 return result;
33 9568 }
34
35 template <uint8_t P, uint8_t Q, uint8_t R, uint8_t Bias, uint8_t Shift>
36 4784 uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c,
37 uint8x8_t d) {
38 // b + c
39 4784 uint16x8_t b_c = vaddl_u8(b, c);
40
41 // a * p
42 4784 uint16x8_t ap = vmull_u8(a, vdup_n_u8(P));
43
44 // d * r
45 4784 uint16x8_t dr = vmull_u8(d, vdup_n_u8(R));
46
47 // a * p + (b + c) * q
48 4784 uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q));
49
50 // d * r + bias
51 4784 uint16x8_t dr_bias = vaddq_u16(dr, vdupq_n_u16(Bias));
52
53 // a * p + (b + c) * q + d * r + bias
54 4784 uint16x8_t ap_bcq_dr_bias = vaddq_u16(ap_bcq, dr_bias);
55
56 // (a * p + (b + c) * q + d * r + bias) >> shift
57 4784 uint8x8_t result = vshrn_n_u16(ap_bcq_dr_bias, Shift);
58 9568 return result;
59 4784 }
60
61 78 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t kleidicv_resize_2x2_stripe_u8(
62 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
63 size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) {
64 78 size_t dst_width = src_width * 2;
65
66 1358 auto lerp1d_scalar = [](uint8_t near, uint8_t far) {
67 1280 return (near * 3 + far + 2) >> 2;
68 };
69
70 1206 auto lerp1d_vector = [](uint8x8_t near, uint8x8_t far) {
71 1128 uint8x8_t three = vdup_n_u8(3);
72 1128 uint8x8_t two = vdup_n_u8(2);
73
74 // near * 3
75 1128 uint16x8_t near3 = vmull_u8(near, three);
76
77 // far + 2
78 1128 uint16x8_t far_2 = vaddl_u8(far, two);
79
80 // near * 3 + far + 2
81 1128 uint16x8_t near3_far_2 = vaddq_u16(near3, far_2);
82
83 // (near * 3 + far + 2) / 4
84 1128 uint8x8_t near3_far_2_div4 = vshrn_n_u16(near3_far_2, 2);
85
86 2256 return near3_far_2_div4;
87 1128 };
88
89 2358 auto lerp2d_scalar = [](uint8_t near, uint8_t mid_a, uint8_t mid_b,
90 uint8_t far) {
91 2280 return (near * 9 + (mid_a + mid_b) * 3 + far + 8) >> 4;
92 };
93
94 4862 auto lerp2d_vector = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) {
95 4784 return lerp2d_vector_p_q_q_1<9, 3, 8, 4>(a, b, c, d);
96 };
97
98 // Handle top or bottom edge
99 200 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
100 const uint8_t *src_row, uint8_t *dst_row) {
101 // Left element
102 122 dst_row[0] = src_row[0];
103
104 // Right element
105 122 dst_row[dst_width - 1] = src_row[src_width - 1];
106
107 // Middle elements
108 122 size_t src_x = 0;
109
2/2
✓ Branch 0 taken 564 times.
✓ Branch 1 taken 122 times.
686 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
110 564 size_t dst_x = src_x * 2 + 1;
111 564 uint8x8_t src_left = vld1_u8(src_row + src_x);
112 564 uint8x8_t src_right = vld1_u8(src_row + src_x + 1);
113
114 564 uint8x8_t dst_left = lerp1d_vector(src_left, src_right);
115 564 uint8x8_t dst_right = lerp1d_vector(src_right, src_left);
116
117 564 vst2_u8(dst_row + dst_x, (uint8x8x2_t{dst_left, dst_right}));
118 564 }
119
2/2
✓ Branch 0 taken 180 times.
✓ Branch 1 taken 122 times.
302 for (; src_x + 1 < src_width; ++src_x) {
120 180 size_t dst_x = src_x * 2 + 1;
121 180 const uint8_t src_left = src_row[src_x], src_right = src_row[src_x + 1];
122 180 dst_row[dst_x] = lerp1d_scalar(src_left, src_right);
123 180 dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left);
124 180 }
125 122 };
126
127 308 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar,
128 lerp2d_vector](const uint8_t *src_row0,
129 const uint8_t *src_row1, uint8_t *dst_row0,
130 uint8_t *dst_row1) {
131 // Left element
132 230 dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]);
133 230 dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]);
134
135 // Right element
136 230 dst_row0[dst_width - 1] =
137 230 lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]);
138 230 dst_row1[dst_width - 1] =
139 230 lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]);
140
141 // Middle elements
142 230 size_t src_x = 0;
143
2/2
✓ Branch 0 taken 1196 times.
✓ Branch 1 taken 230 times.
1426 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
144 1196 size_t dst_x = src_x * 2 + 1;
145
146 1196 uint8x8_t src_tl = vld1_u8(src_row0 + src_x);
147 1196 uint8x8_t src_tr = vld1_u8(src_row0 + src_x + 1);
148 1196 uint8x8_t src_bl = vld1_u8(src_row1 + src_x);
149 1196 uint8x8_t src_br = vld1_u8(src_row1 + src_x + 1);
150
151 1196 uint8x8_t dst_tl = lerp2d_vector(src_tl, src_tr, src_bl, src_br);
152 1196 uint8x8_t dst_tr = lerp2d_vector(src_tr, src_tl, src_br, src_bl);
153 1196 uint8x8_t dst_bl = lerp2d_vector(src_bl, src_tl, src_br, src_tr);
154 1196 uint8x8_t dst_br = lerp2d_vector(src_br, src_tr, src_bl, src_tl);
155
156 1196 vst2_u8(dst_row0 + dst_x, (uint8x8x2_t{dst_tl, dst_tr}));
157 1196 vst2_u8(dst_row1 + dst_x, (uint8x8x2_t{dst_bl, dst_br}));
158 1196 }
159
2/2
✓ Branch 0 taken 570 times.
✓ Branch 1 taken 230 times.
800 for (; src_x + 1 < src_width; ++src_x) {
160 570 size_t dst_x = src_x * 2 + 1;
161 1140 const uint8_t src_tl = src_row0[src_x], src_tr = src_row0[src_x + 1],
162 1140 src_bl = src_row1[src_x], src_br = src_row1[src_x + 1];
163 570 dst_row0[dst_x] = lerp2d_scalar(src_tl, src_tr, src_bl, src_br);
164 570 dst_row0[dst_x + 1] = lerp2d_scalar(src_tr, src_tl, src_br, src_bl);
165 570 dst_row1[dst_x] = lerp2d_scalar(src_bl, src_tl, src_br, src_tr);
166 570 dst_row1[dst_x + 1] = lerp2d_scalar(src_br, src_tr, src_bl, src_tl);
167 570 }
168 230 };
169
170 // Top row
171
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 61 times.
78 if (KLEIDICV_LIKELY(y_begin == 0)) {
172 61 process_edge_row(src, dst);
173 61 }
174
175 // Middle rows
176
2/2
✓ Branch 0 taken 230 times.
✓ Branch 1 taken 78 times.
308 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
177 230 size_t dst_y = src_y * 2 + 1;
178 230 const uint8_t *src_row0 = src + src_stride * src_y;
179 230 const uint8_t *src_row1 = src_row0 + src_stride;
180 230 uint8_t *dst_row0 = dst + dst_stride * dst_y;
181 230 uint8_t *dst_row1 = dst_row0 + dst_stride;
182
183 230 process_row(src_row0, src_row1, dst_row0, dst_row1);
184 230 }
185
186 // Bottom row
187
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 61 times.
78 if (KLEIDICV_LIKELY(y_end == src_height)) {
188 122 process_edge_row(src + src_stride * (src_height - 1),
189 61 dst + dst_stride * (2 * src_height - 1));
190 61 }
191
192 78 return KLEIDICV_OK;
193 78 }
194
195 58 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t kleidicv_resize_4x4_stripe_u8(
196 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
197 size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) {
198 58 size_t dst_width = src_width * 4, dst_height = src_height * 4;
199
200 2234 auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b,
201 uint8_t b) {
202 2176 return (coeff_a * a + coeff_b * b + 4) >> 3;
203 };
204 2314 auto lerp1d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a,
205 uint8_t coeff_b_scalar, uint8x8_t b) {
206 2256 uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar);
207 2256 uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar);
208 2256 uint16x8_t four = vdupq_n_u16(4);
209
210 // a * coeff_a
211 2256 uint16x8_t a1 = vmull_u8(a, coeff_a);
212
213 // b * coeff_b
214 2256 uint16x8_t b1 = vmull_u8(b, coeff_b);
215
216 // a * coeff_a + b * coeff_b
217 2256 uint16x8_t a1_b1 = vaddq_u16(a1, b1);
218
219 // a * coeff_a + b * coeff_b + 4
220 2256 uint16x8_t a1_b1_4 = vaddq_u16(a1_b1, four);
221
222 // (a * coeff_a + b * coeff_b + 4) / 8
223 2256 uint8x8_t result = vshrn_n_u16(a1_b1_4, 3);
224
225 4512 return result;
226 2256 };
227 8346 auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b,
228 uint8_t b, uint8_t coeff_c, uint8_t c,
229 uint8_t coeff_d, uint8_t d) {
230 8288 return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6;
231 };
232 9626 auto lerp2d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a,
233 uint8_t coeff_b_scalar, uint8x8_t b,
234 uint8_t coeff_c_scalar, uint8x8_t c,
235 uint8_t coeff_d_scalar, uint8x8_t d) {
236 9568 uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar);
237 9568 uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar);
238 9568 uint8x8_t coeff_c = vdup_n_u8(coeff_c_scalar);
239 9568 uint8x8_t coeff_d = vdup_n_u8(coeff_d_scalar);
240 9568 uint16x8_t thirtytwo = vdupq_n_u16(32);
241
242 // a * coeff_a
243 9568 uint16x8_t a1 = vmull_u8(a, coeff_a);
244
245 // b * coeff_b
246 9568 uint16x8_t b1 = vmull_u8(b, coeff_b);
247
248 // c * coeff_c
249 9568 uint16x8_t c1 = vmull_u8(c, coeff_c);
250
251 // d * coeff_d
252 9568 uint16x8_t d1 = vmull_u8(d, coeff_d);
253
254 // a * coeff_a + b * coeff_b
255 9568 uint16x8_t a1_b1 = vaddq_u16(a1, b1);
256
257 // c * coeff_c + d * coeff_d
258 9568 uint16x8_t c1_d1 = vaddq_u16(c1, d1);
259
260 // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d
261 9568 uint16x8_t a1_b1_c1_d1 = vaddq_u16(a1_b1, c1_d1);
262
263 // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32
264 9568 uint16x8_t a1_b1_c1_d1_32 = vaddq_u16(a1_b1_c1_d1, thirtytwo);
265
266 // (a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32) / 64
267 9568 uint8x8_t result = vshrn_n_u16(a1_b1_c1_d1_32, 6);
268 19136 return result;
269 9568 };
270 // Handle top or bottom edge
271 140 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
272 const uint8_t *src_row, uint8_t *dst_row) {
273 // Left elements
274 82 dst_row[1] = dst_row[0] = src_row[0];
275
276 // Right elements
277 82 dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
278
279 // Middle elements
280 82 size_t src_x = 0;
281
2/2
✓ Branch 0 taken 564 times.
✓ Branch 1 taken 82 times.
646 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
282 564 size_t dst_x = src_x * 4 + 2;
283 564 uint8x8_t a = vld1_u8(src_row + src_x);
284 564 uint8x8_t b = vld1_u8(src_row + src_x + 1);
285 564 uint8x8x4_t interpolated = {
286 1692 lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b),
287 1128 lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)};
288
289 564 vst4_u8(dst_row + dst_x, interpolated);
290 564 }
291
2/2
✓ Branch 0 taken 124 times.
✓ Branch 1 taken 82 times.
206 for (; src_x + 1 < src_width; ++src_x) {
292 124 size_t dst_x = src_x * 4 + 2;
293 124 const uint8_t a = src_row[src_x], b = src_row[src_x + 1];
294 124 dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b);
295 124 dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b);
296 124 dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b);
297 124 dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b);
298 124 }
299 82 };
300
301 268 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar,
302 lerp2d_vector](const uint8_t *src_row0,
303 const uint8_t *src_row1, uint8_t *dst_row0,
304 uint8_t *dst_row1, uint8_t *dst_row2,
305 uint8_t *dst_row3) {
306 4994 auto lerp2d_vector_49_7_7_1 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c,
307 uint8x8_t d) {
308 4784 return lerp2d_vector_p_q_q_1<49, 7, 32, 6>(a, b, c, d);
309 };
310 4994 auto lerp2d_vector_25_15_15_9 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c,
311 uint8x8_t d) {
312 4784 return lerp2d_vector_p_q_q_r<25, 15, 9, 32, 6>(a, b, c, d);
313 };
314
315 // Left elements
316 210 const uint8_t s0l = src_row0[0], s1l = src_row1[0];
317 210 dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l);
318 210 dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l);
319 210 dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l);
320 210 dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l);
321
322 // Right elements
323 210 const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1];
324 210 const size_t dr0 = dst_width - 2;
325 210 const size_t dr1 = dst_width - 1;
326 210 dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r);
327 210 dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r);
328 210 dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r);
329 210 dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r);
330
331 // Middle elements
332 210 size_t src_x = 0;
333
2/2
✓ Branch 0 taken 1196 times.
✓ Branch 1 taken 210 times.
1406 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
334 1196 size_t dst_x = src_x * 4 + 2;
335
336 1196 uint8x8_t a = vld1_u8(src_row0 + src_x);
337 1196 uint8x8_t b = vld1_u8(src_row0 + src_x + 1);
338 1196 uint8x8_t c = vld1_u8(src_row1 + src_x);
339 1196 uint8x8_t d = vld1_u8(src_row1 + src_x + 1);
340
341 2392 vst4_u8(dst_row0 + dst_x, (uint8x8x4_t{
342 4784 lerp2d_vector_49_7_7_1(a, b, c, d),
343 1196 lerp2d_vector(35, a, 21, b, 5, c, 3, d),
344 1196 lerp2d_vector(21, a, 35, b, 3, c, 5, d),
345 1196 lerp2d_vector_49_7_7_1(b, a, d, c),
346 }));
347 2392 vst4_u8(dst_row1 + dst_x, (uint8x8x4_t{
348 4784 lerp2d_vector(35, a, 5, b, 21, c, 3, d),
349 1196 lerp2d_vector_25_15_15_9(a, b, c, d),
350 1196 lerp2d_vector_25_15_15_9(b, a, d, c),
351 1196 lerp2d_vector(5, a, 35, b, 3, c, 21, d),
352 }));
353 2392 vst4_u8(dst_row2 + dst_x, (uint8x8x4_t{
354 4784 lerp2d_vector(21, a, 3, b, 35, c, 5, d),
355 1196 lerp2d_vector_25_15_15_9(c, a, d, b),
356 1196 lerp2d_vector_25_15_15_9(d, b, c, a),
357 1196 lerp2d_vector(3, a, 21, b, 5, c, 35, d),
358 }));
359 2392 vst4_u8(dst_row3 + dst_x, (uint8x8x4_t{
360 4784 lerp2d_vector_49_7_7_1(c, a, d, b),
361 1196 lerp2d_vector(5, a, 3, b, 35, c, 21, d),
362 1196 lerp2d_vector(3, a, 5, b, 21, c, 35, d),
363 1196 lerp2d_vector_49_7_7_1(d, b, c, a),
364 }));
365 1196 }
366
2/2
✓ Branch 0 taken 518 times.
✓ Branch 1 taken 210 times.
728 for (; src_x + 1 < src_width; ++src_x) {
367 518 size_t dst_x = src_x * 4 + 2;
368 1036 const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1],
369 1036 c = src_row1[src_x], d = src_row1[src_x + 1];
370
371 518 dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d);
372 518 dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d);
373 518 dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d);
374 518 dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d);
375 518 dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d);
376 518 dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d);
377 518 dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d);
378 518 dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d);
379 518 dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d);
380 518 dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d);
381 518 dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d);
382 518 dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d);
383 518 dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d);
384 518 dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d);
385 518 dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d);
386 518 dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d);
387 518 }
388 210 };
389
390 // Top rows
391
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
58 if (KLEIDICV_LIKELY(y_begin == 0)) {
392 41 process_edge_row(src, dst);
393 41 memcpy(dst + dst_stride, dst, dst_stride);
394 41 }
395
396 // Middle rows
397
2/2
✓ Branch 0 taken 210 times.
✓ Branch 1 taken 58 times.
268 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
398 210 size_t dst_y = src_y * 4 + 2;
399 210 const uint8_t *src_row0 = src + src_stride * src_y;
400 210 const uint8_t *src_row1 = src_row0 + src_stride;
401 210 uint8_t *dst_row0 = dst + dst_stride * dst_y;
402 210 uint8_t *dst_row1 = dst_row0 + dst_stride;
403 210 uint8_t *dst_row2 = dst_row1 + dst_stride;
404 210 uint8_t *dst_row3 = dst_row2 + dst_stride;
405
406 210 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
407 210 }
408
409 // Bottom rows
410
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
58 if (KLEIDICV_LIKELY(y_end == src_height)) {
411 82 process_edge_row(src + src_stride * (src_height - 1),
412 41 dst + dst_stride * (dst_height - 2));
413 123 memcpy(dst + dst_stride * (dst_height - 1),
414 82 dst + dst_stride * (dst_height - 2), dst_stride);
415 41 }
416
417 58 return KLEIDICV_OK;
418 58 }
419
420 74 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32(
421 const float *src, size_t src_stride, size_t src_width, size_t src_height,
422 size_t y_begin, size_t y_end, float *dst, size_t dst_stride) {
423 74 size_t dst_width = src_width * 2;
424 74 src_stride /= sizeof(float);
425 74 dst_stride /= sizeof(float);
426
427 1274 auto lerp1d_scalar = [](float near, float far) {
428 1200 return near * 0.75F + far * 0.25F;
429 };
430
431 2346 auto lerp1d_vector = [](float32x4_t near, float32x4_t far) {
432 2272 return vmlaq_n_f32(vmulq_n_f32(near, 0.75F), far, 0.25F);
433 };
434
435 2162 auto lerp2d_scalar = [](float near, float mid_a, float mid_b, float far) {
436 2088 return near * 0.5625F + mid_a * 0.1875F + mid_b * 0.1875F + far * 0.0625F;
437 };
438
439 9690 auto lerp2d_vector = [](float32x4_t a, float32x4_t b, float32x4_t c,
440 float32x4_t d) {
441 9616 return vmlaq_n_f32(
442 9616 vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, 0.5625F), b, 0.1875F), c,
443 0.1875F),
444 9616 d, 0.0625F);
445 };
446
447 // Handle top or bottom edge
448 188 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
449 const float *src_row, float *dst_row) {
450 // Left element
451 114 dst_row[0] = src_row[0];
452
453 // Right element
454 114 dst_row[dst_width - 1] = src_row[src_width - 1];
455
456 // Middle elements
457 114 size_t src_x = 0;
458
2/2
✓ Branch 0 taken 1136 times.
✓ Branch 1 taken 114 times.
1250 for (; src_x + 4 < src_width; src_x += 4) {
459 1136 size_t dst_x = src_x * 2 + 1;
460 1136 float32x4_t src_left = vld1q_f32(src_row + src_x);
461 1136 float32x4_t src_right = vld1q_f32(src_row + src_x + 1);
462
463 1136 float32x4_t dst_left = lerp1d_vector(src_left, src_right);
464 1136 float32x4_t dst_right = lerp1d_vector(src_right, src_left);
465
466 1136 vst2q_f32(dst_row + dst_x, (float32x4x2_t{dst_left, dst_right}));
467 1136 }
468
2/2
✓ Branch 0 taken 140 times.
✓ Branch 1 taken 114 times.
254 for (; src_x + 1 < src_width; ++src_x) {
469 140 size_t dst_x = src_x * 2 + 1;
470 140 const float src_left = src_row[src_x], src_right = src_row[src_x + 1];
471 140 dst_row[dst_x] = lerp1d_scalar(src_left, src_right);
472 140 dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left);
473 140 }
474 114 };
475
476 304 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar,
477 lerp2d_vector](const float *src_row0,
478 const float *src_row1, float *dst_row0,
479 float *dst_row1) {
480 // Left element
481 230 dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]);
482 230 dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]);
483
484 // Right element
485 230 dst_row0[dst_width - 1] =
486 230 lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]);
487 230 dst_row1[dst_width - 1] =
488 230 lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]);
489
490 // Middle elements
491 230 size_t src_x = 0;
492
2/2
✓ Branch 0 taken 2404 times.
✓ Branch 1 taken 230 times.
2634 for (; src_x + 4 < src_width; src_x += 4) {
493 2404 size_t dst_x = src_x * 2 + 1;
494
495 2404 float32x4_t a = vld1q_f32(src_row0 + src_x);
496 2404 float32x4_t b = vld1q_f32(src_row0 + src_x + 1);
497 2404 float32x4_t c = vld1q_f32(src_row1 + src_x);
498 2404 float32x4_t d = vld1q_f32(src_row1 + src_x + 1);
499
500 4808 vst2q_f32(dst_row0 + dst_x, (float32x4x2_t{lerp2d_vector(a, b, c, d),
501 2404 lerp2d_vector(b, a, d, c)}));
502 4808 vst2q_f32(dst_row1 + dst_x, (float32x4x2_t{lerp2d_vector(c, a, d, b),
503 2404 lerp2d_vector(d, b, c, a)}));
504 2404 }
505
2/2
✓ Branch 0 taken 522 times.
✓ Branch 1 taken 230 times.
752 for (; src_x + 1 < src_width; ++src_x) {
506 522 size_t dst_x = src_x * 2 + 1;
507 1044 const float a = src_row0[src_x], b = src_row0[src_x + 1],
508 1044 c = src_row1[src_x], d = src_row1[src_x + 1];
509 522 dst_row0[dst_x] = lerp2d_scalar(a, b, c, d);
510 522 dst_row0[dst_x + 1] = lerp2d_scalar(b, a, d, c);
511 522 dst_row1[dst_x] = lerp2d_scalar(c, a, d, b);
512 522 dst_row1[dst_x + 1] = lerp2d_scalar(d, b, c, a);
513 522 }
514 230 };
515
516 // Top row
517
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 57 times.
74 if (KLEIDICV_LIKELY(y_begin == 0)) {
518 57 process_edge_row(src, dst);
519 57 }
520
521 // Middle rows
522
2/2
✓ Branch 0 taken 230 times.
✓ Branch 1 taken 74 times.
304 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
523 230 size_t dst_y = src_y * 2 + 1;
524 230 const float *src_row0 = src + src_stride * src_y;
525 230 const float *src_row1 = src_row0 + src_stride;
526 230 float *dst_row0 = dst + dst_stride * dst_y;
527 230 float *dst_row1 = dst_row0 + dst_stride;
528
529 230 process_row(src_row0, src_row1, dst_row0, dst_row1);
530 230 }
531
532 // Bottom row
533
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 57 times.
74 if (KLEIDICV_LIKELY(y_end == src_height)) {
534 114 process_edge_row(src + src_stride * (src_height - 1),
535 57 dst + dst_stride * (src_height * 2 - 1));
536 57 }
537
538 74 return KLEIDICV_OK;
539 74 }
540
541 62 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32(
542 const float *src, size_t src_stride, size_t src_width, size_t src_height,
543 size_t y_begin, size_t y_end, float *dst, size_t dst_stride) {
544 using T = float;
545 62 size_t dst_height = src_height * 4;
546 62 size_t dst_width = src_width * 4;
547 62 src_stride /= sizeof(T);
548 62 dst_stride /= sizeof(T);
549
550 2174 auto lerp1d_scalar = [](T coeff_a, T a, T coeff_b, T b) {
551 2112 return coeff_a * a + coeff_b * b;
552 };
553 23838 auto lerp1d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b) {
554 23776 return vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b);
555 };
556 7646 auto lerp2d_scalar = [](T coeff_a, T a, T coeff_b, T b, T coeff_c, T c,
557 T coeff_d, T d) {
558 7584 return coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d;
559 };
560 19294 auto lerp2d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b,
561 T coeff_c, float32x4_t c, T coeff_d, float32x4_t d) {
562 19232 return vmlaq_n_f32(
563 38464 vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b), c,
564 19232 coeff_c),
565 19232 d, coeff_d);
566 };
567 // Handle top or bottom edge
568 152 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
569 const T *src_row, T *dst_row) {
570 // Left elements
571 90 dst_row[1] = dst_row[0] = src_row[0];
572
573 // Right elements
574 90 dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
575
576 // Middle elements
577 90 size_t src_x = 0;
578
2/2
✓ Branch 0 taken 1136 times.
✓ Branch 1 taken 90 times.
1226 for (; src_x + 4 < src_width; src_x += 4) {
579 1136 size_t dst_x = src_x * 4 + 2;
580 1136 float32x4_t a = vld1q_f32(src_row + src_x);
581 1136 float32x4_t b = vld1q_f32(src_row + src_x + 1);
582 2272 vst4q_f32(dst_row + dst_x,
583 4544 (float32x4x4_t{lerp1d_vector(0.875F, a, 0.125F, b),
584 1136 lerp1d_vector(0.625F, a, 0.375F, b),
585 1136 lerp1d_vector(0.375F, a, 0.625F, b),
586 1136 lerp1d_vector(0.125F, a, 0.875F, b)}));
587 1136 }
588
2/2
✓ Branch 0 taken 100 times.
✓ Branch 1 taken 90 times.
190 for (; src_x + 1 < src_width; ++src_x) {
589 100 size_t dst_x = src_x * 4 + 2;
590 100 const T a = src_row[src_x], b = src_row[src_x + 1];
591 100 dst_row[dst_x + 0] = lerp1d_scalar(0.875F, a, 0.125F, b);
592 100 dst_row[dst_x + 1] = lerp1d_scalar(0.625F, a, 0.375F, b);
593 100 dst_row[dst_x + 2] = lerp1d_scalar(0.375F, a, 0.625F, b);
594 100 dst_row[dst_x + 3] = lerp1d_scalar(0.125F, a, 0.875F, b);
595 100 }
596 90 };
597
598 276 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector,
599 lerp2d_scalar, lerp2d_vector](
600 const T *src_row0, const T *src_row1, T *dst_row0,
601 T *dst_row1, T *dst_row2, T *dst_row3) {
602 // Left elements
603 214 const T s0l = src_row0[0], s1l = src_row1[0];
604 214 dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l);
605 214 dst_row1[0] = dst_row1[1] = lerp1d_scalar(0.625F, s0l, 0.375F, s1l);
606 214 dst_row2[0] = dst_row2[1] = lerp1d_scalar(0.375F, s0l, 0.625F, s1l);
607 214 dst_row3[0] = dst_row3[1] = lerp1d_scalar(0.125F, s0l, 0.875F, s1l);
608
609 // Right elements
610 214 const T s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1];
611 214 const size_t dr0 = dst_width - 2;
612 214 const size_t dr1 = dst_width - 1;
613 214 dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(0.875F, s0r, 0.125F, s1r);
614 214 dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(0.625F, s0r, 0.375F, s1r);
615 214 dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(0.375F, s0r, 0.625F, s1r);
616 214 dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r);
617
618 // Middle elements
619 214 size_t src_x = 0;
620
2/2
✓ Branch 0 taken 2404 times.
✓ Branch 1 taken 214 times.
2618 for (; src_x + 4 < src_width; src_x += 4) {
621 2404 size_t dst_x = src_x * 4 + 2;
622
623 2404 float32x4_t a = vld1q_f32(src_row0 + src_x);
624 2404 float32x4_t b = vld1q_f32(src_row0 + src_x + 1);
625 2404 float32x4_t c = vld1q_f32(src_row1 + src_x);
626 2404 float32x4_t d = vld1q_f32(src_row1 + src_x + 1);
627
628 2404 float32x4x4_t dst_a{
629 9616 lerp2d_vector(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d),
630 2404 lerp2d_vector(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d),
631 2404 lerp2d_vector(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d),
632 2404 lerp2d_vector(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d),
633 };
634 2404 float32x4x4_t dst_d{
635 9616 lerp2d_vector(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d),
636 2404 lerp2d_vector(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d),
637 2404 lerp2d_vector(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d),
638 2404 lerp2d_vector(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d),
639 };
640 2404 const float one_3rd = 0.3333333333333333F;
641 2404 const float two_3rd = 0.6666666666666667F;
642 2404 vst4q_f32(dst_row0 + dst_x, dst_a);
643 4808 vst4q_f32(dst_row1 + dst_x,
644 2404 (float32x4x4_t{
645 9616 lerp1d_vector(two_3rd, dst_a.val[0], one_3rd, dst_d.val[0]),
646 2404 lerp1d_vector(two_3rd, dst_a.val[1], one_3rd, dst_d.val[1]),
647 2404 lerp1d_vector(two_3rd, dst_a.val[2], one_3rd, dst_d.val[2]),
648 2404 lerp1d_vector(two_3rd, dst_a.val[3], one_3rd, dst_d.val[3]),
649 }));
650 4808 vst4q_f32(dst_row2 + dst_x,
651 2404 (float32x4x4_t{
652 9616 lerp1d_vector(one_3rd, dst_a.val[0], two_3rd, dst_d.val[0]),
653 2404 lerp1d_vector(one_3rd, dst_a.val[1], two_3rd, dst_d.val[1]),
654 2404 lerp1d_vector(one_3rd, dst_a.val[2], two_3rd, dst_d.val[2]),
655 2404 lerp1d_vector(one_3rd, dst_a.val[3], two_3rd, dst_d.val[3]),
656 }));
657 2404 vst4q_f32(dst_row3 + dst_x, dst_d);
658 2404 }
659
660
2/2
✓ Branch 0 taken 474 times.
✓ Branch 1 taken 214 times.
688 for (; src_x + 1 < src_width; ++src_x) {
661 474 size_t dst_x = src_x * 4 + 2;
662 474 const T a = src_row0[src_x], b = src_row0[src_x + 1], c = src_row1[src_x],
663 474 d = src_row1[src_x + 1];
664
665 474 dst_row0[dst_x + 0] =
666 474 lerp2d_scalar(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d);
667 474 dst_row0[dst_x + 1] =
668 474 lerp2d_scalar(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d);
669 474 dst_row0[dst_x + 2] =
670 474 lerp2d_scalar(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d);
671 474 dst_row0[dst_x + 3] =
672 474 lerp2d_scalar(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d);
673 474 dst_row1[dst_x + 0] =
674 474 lerp2d_scalar(0.546875F, a, 0.078125F, b, 0.328125F, c, 0.046875F, d);
675 474 dst_row1[dst_x + 1] =
676 474 lerp2d_scalar(0.390625F, a, 0.234375F, b, 0.234375F, c, 0.140625F, d);
677 474 dst_row1[dst_x + 2] =
678 474 lerp2d_scalar(0.234375F, a, 0.390625F, b, 0.140625F, c, 0.234375F, d);
679 474 dst_row1[dst_x + 3] =
680 474 lerp2d_scalar(0.078125F, a, 0.546875F, b, 0.046875F, c, 0.328125F, d);
681 474 dst_row2[dst_x + 0] =
682 474 lerp2d_scalar(0.328125F, a, 0.046875F, b, 0.546875F, c, 0.078125F, d);
683 474 dst_row2[dst_x + 1] =
684 474 lerp2d_scalar(0.234375F, a, 0.140625F, b, 0.390625F, c, 0.234375F, d);
685 474 dst_row2[dst_x + 2] =
686 474 lerp2d_scalar(0.140625F, a, 0.234375F, b, 0.234375F, c, 0.390625F, d);
687 474 dst_row2[dst_x + 3] =
688 474 lerp2d_scalar(0.046875F, a, 0.328125F, b, 0.078125F, c, 0.546875F, d);
689 474 dst_row3[dst_x + 0] =
690 474 lerp2d_scalar(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d);
691 474 dst_row3[dst_x + 1] =
692 474 lerp2d_scalar(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d);
693 474 dst_row3[dst_x + 2] =
694 474 lerp2d_scalar(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d);
695 474 dst_row3[dst_x + 3] =
696 474 lerp2d_scalar(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d);
697 474 }
698 214 };
699
700 // Top rows
701
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 45 times.
62 if (KLEIDICV_LIKELY(y_begin == 0)) {
702 45 process_edge_row(src, dst);
703 45 memcpy(dst + dst_stride, dst, dst_stride * sizeof(T));
704 45 }
705
706 // Middle rows
707
2/2
✓ Branch 0 taken 214 times.
✓ Branch 1 taken 62 times.
276 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
708 214 size_t dst_y = src_y * 4 + 2;
709 214 const T *src_row0 = src + src_stride * src_y;
710 214 const T *src_row1 = src_row0 + src_stride;
711 214 T *dst_row0 = dst + dst_stride * dst_y;
712 214 T *dst_row1 = dst_row0 + dst_stride;
713 214 T *dst_row2 = dst_row1 + dst_stride;
714 214 T *dst_row3 = dst_row2 + dst_stride;
715
716 214 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
717 214 }
718
719 // Bottom rows
720
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 45 times.
62 if (KLEIDICV_LIKELY(y_end == src_height)) {
721 90 process_edge_row(src + src_stride * (src_height - 1),
722 45 dst + dst_stride * (dst_height - 2));
723 135 memcpy(dst + dst_stride * (dst_height - 1),
724 90 dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T));
725 45 }
726
727 62 return KLEIDICV_OK;
728 62 }
729
730 58 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32(
731 const float *src, size_t src_stride, size_t src_width, size_t src_height,
732 size_t y_begin, size_t y_end, float *dst, size_t dst_stride) {
733 58 size_t dst_width = src_width * 8;
734 58 size_t dst_height = src_height * 8;
735 58 src_stride /= sizeof(float);
736 58 dst_stride /= sizeof(float);
737
738 58 float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0,
739 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0};
740 58 float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0,
741 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0};
742 58 float32x4_t coeffs_a0 = vld1q_f32(&coeffs_a[0]);
743 58 float32x4_t coeffs_a1 = vld1q_f32(&coeffs_a[4]);
744 58 float32x4_t coeffs_b0 = vld1q_f32(&coeffs_b[0]);
745 58 float32x4_t coeffs_b1 = vld1q_f32(&coeffs_b[4]);
746
747 3418 auto lerp1d_vector_n = [](float p, float32x4_t a, float q, float32x4_t b) {
748 3360 return vmlaq_n_f32(vmulq_n_f32(a, p), b, q);
749 };
750
751 121090 auto lerp1d_vector_n2 = [](float32x4_t a, float q, float32x4_t b) {
752 121032 return vmlaq_n_f32(a, b, q);
753 };
754
755 9330 auto lerp1d_vector = [](float32x4_t p, float32x4_t a, float32x4_t q,
756 float32x4_t b) {
757 9272 return vmlaq_f32(vmulq_f32(a, p), b, q);
758 };
759
760 // Handle top or bottom edge
761 58 auto process_edge_row =
762 140 [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0,
763 &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) {
764 // Left elements
765 82 dst_row[3] = dst_row[2] = dst_row[1] = dst_row[0] = src_row[0];
766 82 dst_row[dst_stride + 3] = dst_row[dst_stride + 2] =
767 82 dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0];
768 82 dst_row[2 * dst_stride + 3] = dst_row[2 * dst_stride + 2] =
769 82 dst_row[2 * dst_stride + 1] = dst_row[2 * dst_stride] = src_row[0];
770 82 dst_row[3 * dst_stride + 3] = dst_row[3 * dst_stride + 2] =
771 82 dst_row[3 * dst_stride + 1] = dst_row[3 * dst_stride] = src_row[0];
772
773 // Right elements
774 82 float *dst_right = dst_row + dst_width - 4;
775 82 dst_right[3] = dst_right[2] = dst_right[1] = dst_right[0] =
776 82 src_row[src_width - 1];
777 82 dst_right[dst_stride + 3] = dst_right[dst_stride + 2] =
778 82 dst_right[dst_stride + 1] = dst_right[dst_stride] =
779 82 src_row[src_width - 1];
780 82 dst_right[2 * dst_stride + 3] = dst_right[2 * dst_stride + 2] =
781 82 dst_right[2 * dst_stride + 1] = dst_right[2 * dst_stride] =
782 82 src_row[src_width - 1];
783 82 dst_right[3 * dst_stride + 3] = dst_right[3 * dst_stride + 2] =
784 82 dst_right[3 * dst_stride + 1] = dst_right[3 * dst_stride] =
785 82 src_row[src_width - 1];
786
787 // Middle elements
788 82 float32x4_t a, b = vdupq_n_f32(src_row[0]);
789
2/2
✓ Branch 0 taken 82 times.
✓ Branch 1 taken 4636 times.
4718 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
790 4636 a = b;
791 4636 b = vdupq_n_f32(src_row[src_x + 1]);
792 4636 float *dst_row0 = dst_row + src_x * 8 + 4;
793 4636 float *dst_row1 = dst_row0 + dst_stride;
794 4636 float *dst_row2 = dst_row1 + dst_stride;
795 4636 float *dst_row3 = dst_row2 + dst_stride;
796 4636 float32x4_t dst = lerp1d_vector(coeffs_a0, a, coeffs_b0, b);
797 4636 vst1q(dst_row0, dst);
798 4636 vst1q(dst_row1, dst);
799 4636 vst1q(dst_row2, dst);
800 4636 vst1q(dst_row3, dst);
801 4636 dst = lerp1d_vector(coeffs_a1, a, coeffs_b1, b);
802 4636 vst1q(dst_row0 + 4, dst);
803 4636 vst1q(dst_row1 + 4, dst);
804 4636 vst1q(dst_row2 + 4, dst);
805 4636 vst1q(dst_row3 + 4, dst);
806 4636 }
807 82 };
808
809 58 float32x4_t coeffs_p0 = vmulq_n_f32(coeffs_a0, 15.0 / 16);
810 58 float32x4_t coeffs_q0 = vmulq_n_f32(coeffs_b0, 15.0 / 16);
811 58 float32x4_t coeffs_r0 = vmulq_n_f32(coeffs_a0, 1.0 / 16);
812 58 float32x4_t coeffs_s0 = vmulq_n_f32(coeffs_b0, 1.0 / 16);
813 58 float32x4_t coeffs_p1 = vmulq_n_f32(coeffs_a1, 15.0 / 16);
814 58 float32x4_t coeffs_q1 = vmulq_n_f32(coeffs_b1, 15.0 / 16);
815 58 float32x4_t coeffs_r1 = vmulq_n_f32(coeffs_a1, 1.0 / 16);
816 58 float32x4_t coeffs_s1 = vmulq_n_f32(coeffs_b1, 1.0 / 16);
817
818 40402 auto lerp2d_vector = [](float32x4_t a, float32x4_t p, float32x4_t b,
819 float32x4_t q, float32x4_t c, float32x4_t r,
820 float32x4_t d, float32x4_t s) {
821 40344 return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s);
822 };
823
824 268 auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n,
825 lerp1d_vector_n2, &coeffs_p0, &coeffs_q0, &coeffs_r0,
826 &coeffs_s0, &coeffs_p1, &coeffs_q1, &coeffs_r1,
827 &coeffs_s1](const float *src_row0, const float *src_row1,
828 float *dst_row0, size_t dst_stride) {
829 // Left elements
830 210 float32x4_t s0 = vdupq_n_f32(src_row0[0]);
831 210 float32x4_t s1 = vdupq_n_f32(src_row1[0]);
832 210 float *dst_row = dst_row0;
833
2/2
✓ Branch 0 taken 1680 times.
✓ Branch 1 taken 210 times.
1890 for (size_t i = 0; i < 8; ++i) {
834 3360 vst1q(dst_row,
835 3360 lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0,
836 1680 static_cast<float>(i * 2 + 1) / 16.0F, s1));
837 1680 dst_row += dst_stride;
838 1680 }
839
840 // Middle elements
841 210 dst_row0 += 4;
842 210 float *dst_row1 = dst_row0 + dst_stride;
843 210 float *dst_row2 = dst_row1 + dst_stride;
844 210 float *dst_row3 = dst_row2 + dst_stride;
845 210 float *dst_row4 = dst_row3 + dst_stride;
846 210 float *dst_row5 = dst_row4 + dst_stride;
847 210 float *dst_row6 = dst_row5 + dst_stride;
848 210 float *dst_row7 = dst_row6 + dst_stride;
849 210 float32x4_t a, b = s0;
850 210 float32x4_t c, d = s1;
851
2/2
✓ Branch 0 taken 10086 times.
✓ Branch 1 taken 210 times.
10296 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
852 10086 KLEIDICV_PREFETCH(dst_row0 + 64);
853 10086 KLEIDICV_PREFETCH(dst_row1 + 64);
854 10086 KLEIDICV_PREFETCH(dst_row2 + 64);
855 10086 KLEIDICV_PREFETCH(dst_row3 + 64);
856 10086 KLEIDICV_PREFETCH(dst_row4 + 64);
857 10086 KLEIDICV_PREFETCH(dst_row5 + 64);
858 10086 KLEIDICV_PREFETCH(dst_row6 + 64);
859 10086 KLEIDICV_PREFETCH(dst_row7 + 64);
860 10086 a = b;
861 10086 b = vdupq_n_f32(src_row0[src_x + 1]);
862 10086 c = d;
863 10086 d = vdupq_n_f32(src_row1[src_x + 1]);
864 10086 float32x4x2_t dst_0;
865 10086 dst_0.val[0] =
866 10086 lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d);
867 10086 dst_0.val[1] =
868 10086 lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d);
869
870 10086 neon::VecTraits<float>::store(dst_0, dst_row0);
871 10086 float32x4x2_t dst_7;
872 10086 dst_7.val[0] =
873 10086 lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d);
874 10086 dst_7.val[1] =
875 10086 lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d);
876
877 10086 neon::VecTraits<float>::store(dst_7, dst_row7);
878 10086 float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]);
879 10086 float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]);
880
881 10086 float32x4x2_t dst;
882 10086 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0);
883 10086 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1);
884
885 10086 neon::VecTraits<float>::store(dst, dst_row1);
886 10086 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0);
887 10086 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1);
888
889 10086 neon::VecTraits<float>::store(dst, dst_row2);
890 10086 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0);
891 10086 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1);
892
893 10086 neon::VecTraits<float>::store(dst, dst_row3);
894 10086 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0);
895 10086 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1);
896
897 10086 neon::VecTraits<float>::store(dst, dst_row4);
898 10086 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0);
899 10086 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1);
900
901 10086 neon::VecTraits<float>::store(dst, dst_row5);
902 10086 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0);
903 10086 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1);
904
905 10086 neon::VecTraits<float>::store(dst, dst_row6);
906 10086 dst_row0 += 8;
907 10086 dst_row1 += 8;
908 10086 dst_row2 += 8;
909 10086 dst_row3 += 8;
910 10086 dst_row4 += 8;
911 10086 dst_row5 += 8;
912 10086 dst_row6 += 8;
913 10086 dst_row7 += 8;
914 10086 }
915
916 // Right elements
917 210 s0 = b;
918 210 s1 = d;
919 210 dst_row = dst_row0;
920
2/2
✓ Branch 0 taken 210 times.
✓ Branch 1 taken 1680 times.
1890 for (size_t i = 0; i < 8; ++i) {
921 3360 vst1q(dst_row,
922 3360 lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0,
923 1680 static_cast<float>(i * 2 + 1) / 16.0F, s1));
924 1680 dst_row += dst_stride;
925 1680 }
926 210 };
927
928 // Top rows
929
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
58 if (KLEIDICV_LIKELY(y_begin == 0)) {
930 41 process_edge_row(src, dst, dst_stride);
931 41 }
932
933 // Middle rows
934
2/2
✓ Branch 0 taken 210 times.
✓ Branch 1 taken 58 times.
268 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
935 210 size_t dst_y = src_y * 8 + 4;
936 210 const float *src_row0 = src + src_stride * src_y;
937 210 const float *src_row1 = src_row0 + src_stride;
938 210 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride);
939 210 }
940
941 // Bottom rows
942
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
58 if (KLEIDICV_LIKELY(y_end == src_height)) {
943 82 process_edge_row(src + src_stride * (src_height - 1),
944 41 dst + dst_stride * (dst_height - 4), dst_stride);
945 41 }
946
947 58 return KLEIDICV_OK;
948 58 }
949
950 196 kleidicv_error_t kleidicv_resize_linear_stripe_f32(
951 const float *src, size_t src_stride, size_t src_width, size_t src_height,
952 size_t y_begin, size_t y_end, float *dst, size_t dst_stride,
953 size_t dst_width, size_t dst_height) {
954
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 195 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 195 times.
196 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
955
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 194 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 194 times.
195 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
956
957
2/4
✓ Branch 0 taken 194 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 194 times.
194 if (src_width == 0 || src_height == 0) {
958 return KLEIDICV_OK;
959 }
960
3/4
✓ Branch 0 taken 74 times.
✓ Branch 1 taken 120 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 74 times.
194 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
961 148 return resize_2x2_f32(src, src_stride, src_width, src_height, y_begin,
962 74 y_end, dst, dst_stride);
963 }
964
3/4
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 58 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 62 times.
120 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
965 124 return resize_4x4_f32(src, src_stride, src_width, src_height, y_begin,
966 62 y_end, dst, dst_stride);
967 }
968
2/4
✓ Branch 0 taken 58 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 58 times.
58 if (src_width * 8 == dst_width && src_height * 8 == dst_height) {
969 116 return resize_8x8_f32(src, src_stride, src_width, src_height, y_begin,
970 58 y_end, dst, dst_stride);
971 }
972 // resize_linear_f32_is_implemented checked the kernel size already.
973 // GCOVR_EXCL_START
974 assert(!"resize ratio not implemented");
975 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
976 // GCOVR_EXCL_STOP
977 196 }
978 } // namespace kleidicv::neon
979