KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 630 630 100.0%
Functions: 39 39 100.0%
Branches: 119 126 94.4%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "kleidicv/operations.h"
10 #include "kleidicv/resize/resize_linear.h"
11
12 namespace kleidicv::neon {
13
14 template <uint8_t P, uint8_t Q, uint8_t Bias, uint8_t Shift>
15 8848 uint8x8_t lerp2d_vector_p_q_q_1(uint8x8_t a, uint8x8_t b, uint8x8_t c,
16 uint8x8_t d) {
17 // b + c
18 8848 uint16x8_t b_c = vaddl_u8(b, c);
19
20 // a * p
21 8848 uint16x8_t ap = vmull_u8(a, vdup_n_u8(P));
22
23 // a * p + (b + c) * q
24 8848 uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q));
25
26 // d + bias
27 8848 uint16x8_t d_bias = vaddl_u8(d, vdup_n_u8(Bias));
28
29 // a * p + (b + c) * q + d + bias
30 8848 uint16x8_t ap_bcq_d_bias = vaddq_u16(ap_bcq, d_bias);
31
32 // (a * p + (b + c) * q + d + bias) >> shift
33 8848 uint8x8_t result = vshrn_n_u16(ap_bcq_d_bias, Shift);
34 17696 return result;
35 8848 }
36
37 template <uint8_t P, uint8_t Q, uint8_t R, uint8_t Bias, uint8_t Shift>
38 4424 uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c,
39 uint8x8_t d) {
40 // b + c
41 4424 uint16x8_t b_c = vaddl_u8(b, c);
42
43 // a * p
44 4424 uint16x8_t ap = vmull_u8(a, vdup_n_u8(P));
45
46 // d * r
47 4424 uint16x8_t dr = vmull_u8(d, vdup_n_u8(R));
48
49 // a * p + (b + c) * q
50 4424 uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q));
51
52 // d * r + bias
53 4424 uint16x8_t dr_bias = vaddq_u16(dr, vdupq_n_u16(Bias));
54
55 // a * p + (b + c) * q + d * r + bias
56 4424 uint16x8_t ap_bcq_dr_bias = vaddq_u16(ap_bcq, dr_bias);
57
58 // (a * p + (b + c) * q + d * r + bias) >> shift
59 4424 uint8x8_t result = vshrn_n_u16(ap_bcq_dr_bias, Shift);
60 8848 return result;
61 4424 }
62
63 54 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8(
64 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
65 size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) {
66 54 size_t dst_width = src_width * 2;
67
68 926 auto lerp1d_scalar = [](uint8_t near, uint8_t far) {
69 872 return (near * 3 + far + 2) >> 2;
70 };
71
72 1150 auto lerp1d_vector = [](uint8x8_t near, uint8x8_t far) {
73 1096 uint8x8_t three = vdup_n_u8(3);
74 1096 uint8x8_t two = vdup_n_u8(2);
75
76 // near * 3
77 1096 uint16x8_t near3 = vmull_u8(near, three);
78
79 // far + 2
80 1096 uint16x8_t far_2 = vaddl_u8(far, two);
81
82 // near * 3 + far + 2
83 1096 uint16x8_t near3_far_2 = vaddq_u16(near3, far_2);
84
85 // (near * 3 + far + 2) / 4
86 1096 uint8x8_t near3_far_2_div4 = vshrn_n_u16(near3_far_2, 2);
87
88 2192 return near3_far_2_div4;
89 1096 };
90
91 1382 auto lerp2d_scalar = [](uint8_t near, uint8_t mid_a, uint8_t mid_b,
92 uint8_t far) {
93 1328 return (near * 9 + (mid_a + mid_b) * 3 + far + 8) >> 4;
94 };
95
96 4478 auto lerp2d_vector = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) {
97 4424 return lerp2d_vector_p_q_q_1<9, 3, 8, 4>(a, b, c, d);
98 };
99
100 // Handle top or bottom edge
101 160 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
102 const uint8_t *src_row, uint8_t *dst_row) {
103 // Left element
104 106 dst_row[0] = src_row[0];
105
106 // Right element
107 106 dst_row[dst_width - 1] = src_row[src_width - 1];
108
109 // Middle elements
110 106 size_t src_x = 0;
111
2/2
✓ Branch 0 taken 548 times.
✓ Branch 1 taken 106 times.
654 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
112 548 size_t dst_x = src_x * 2 + 1;
113 548 uint8x8_t src_left = vld1_u8(src_row + src_x);
114 548 uint8x8_t src_right = vld1_u8(src_row + src_x + 1);
115
116 548 uint8x8_t dst_left = lerp1d_vector(src_left, src_right);
117 548 uint8x8_t dst_right = lerp1d_vector(src_right, src_left);
118
119 548 vst2_u8(dst_row + dst_x, (uint8x8x2_t{dst_left, dst_right}));
120 548 }
121
2/2
✓ Branch 0 taken 140 times.
✓ Branch 1 taken 106 times.
246 for (; src_x + 1 < src_width; ++src_x) {
122 140 size_t dst_x = src_x * 2 + 1;
123 140 const uint8_t src_left = src_row[src_x], src_right = src_row[src_x + 1];
124 140 dst_row[dst_x] = lerp1d_scalar(src_left, src_right);
125 140 dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left);
126 140 }
127 106 };
128
129 202 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar,
130 lerp2d_vector](const uint8_t *src_row0,
131 const uint8_t *src_row1, uint8_t *dst_row0,
132 uint8_t *dst_row1) {
133 // Left element
134 148 dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]);
135 148 dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]);
136
137 // Right element
138 148 dst_row0[dst_width - 1] =
139 148 lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]);
140 148 dst_row1[dst_width - 1] =
141 148 lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]);
142
143 // Middle elements
144 148 size_t src_x = 0;
145
2/2
✓ Branch 0 taken 1106 times.
✓ Branch 1 taken 148 times.
1254 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
146 1106 size_t dst_x = src_x * 2 + 1;
147
148 1106 uint8x8_t src_tl = vld1_u8(src_row0 + src_x);
149 1106 uint8x8_t src_tr = vld1_u8(src_row0 + src_x + 1);
150 1106 uint8x8_t src_bl = vld1_u8(src_row1 + src_x);
151 1106 uint8x8_t src_br = vld1_u8(src_row1 + src_x + 1);
152
153 1106 uint8x8_t dst_tl = lerp2d_vector(src_tl, src_tr, src_bl, src_br);
154 1106 uint8x8_t dst_tr = lerp2d_vector(src_tr, src_tl, src_br, src_bl);
155 1106 uint8x8_t dst_bl = lerp2d_vector(src_bl, src_tl, src_br, src_tr);
156 1106 uint8x8_t dst_br = lerp2d_vector(src_br, src_tr, src_bl, src_tl);
157
158 1106 vst2_u8(dst_row0 + dst_x, (uint8x8x2_t{dst_tl, dst_tr}));
159 1106 vst2_u8(dst_row1 + dst_x, (uint8x8x2_t{dst_bl, dst_br}));
160 1106 }
161
2/2
✓ Branch 0 taken 332 times.
✓ Branch 1 taken 148 times.
480 for (; src_x + 1 < src_width; ++src_x) {
162 332 size_t dst_x = src_x * 2 + 1;
163 664 const uint8_t src_tl = src_row0[src_x], src_tr = src_row0[src_x + 1],
164 664 src_bl = src_row1[src_x], src_br = src_row1[src_x + 1];
165 332 dst_row0[dst_x] = lerp2d_scalar(src_tl, src_tr, src_bl, src_br);
166 332 dst_row0[dst_x + 1] = lerp2d_scalar(src_tr, src_tl, src_br, src_bl);
167 332 dst_row1[dst_x] = lerp2d_scalar(src_bl, src_tl, src_br, src_tr);
168 332 dst_row1[dst_x + 1] = lerp2d_scalar(src_br, src_tr, src_bl, src_tl);
169 332 }
170 148 };
171
172 // Top row
173
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 53 times.
54 if (KLEIDICV_LIKELY(y_begin == 0)) {
174 53 process_edge_row(src, dst);
175 53 }
176
177 // Middle rows
178
2/2
✓ Branch 0 taken 148 times.
✓ Branch 1 taken 54 times.
202 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
179 148 size_t dst_y = src_y * 2 + 1;
180 148 const uint8_t *src_row0 = src + src_stride * src_y;
181 148 const uint8_t *src_row1 = src_row0 + src_stride;
182 148 uint8_t *dst_row0 = dst + dst_stride * dst_y;
183 148 uint8_t *dst_row1 = dst_row0 + dst_stride;
184
185 148 process_row(src_row0, src_row1, dst_row0, dst_row1);
186 148 }
187
188 // Bottom row
189
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 53 times.
54 if (KLEIDICV_LIKELY(y_end == src_height)) {
190 106 process_edge_row(src + src_stride * (src_height - 1),
191 53 dst + dst_stride * (src_height * 2 - 1));
192 53 }
193
194 54 return KLEIDICV_OK;
195 54 }
196
197 34 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8(
198 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
199 size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) {
200 34 size_t dst_width = src_width * 4, dst_height = src_height * 4;
201
202 1394 auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b,
203 uint8_t b) {
204 1360 return (coeff_a * a + coeff_b * b + 4) >> 3;
205 };
206 2226 auto lerp1d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a,
207 uint8_t coeff_b_scalar, uint8x8_t b) {
208 2192 uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar);
209 2192 uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar);
210 2192 uint16x8_t four = vdupq_n_u16(4);
211
212 // a * coeff_a
213 2192 uint16x8_t a1 = vmull_u8(a, coeff_a);
214
215 // b * coeff_b
216 2192 uint16x8_t b1 = vmull_u8(b, coeff_b);
217
218 // a * coeff_a + b * coeff_b
219 2192 uint16x8_t a1_b1 = vaddq_u16(a1, b1);
220
221 // a * coeff_a + b * coeff_b + 4
222 2192 uint16x8_t a1_b1_4 = vaddq_u16(a1_b1, four);
223
224 // (a * coeff_a + b * coeff_b + 4) / 8
225 2192 uint8x8_t result = vshrn_n_u16(a1_b1_4, 3);
226
227 4384 return result;
228 2192 };
229 4514 auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b,
230 uint8_t b, uint8_t coeff_c, uint8_t c,
231 uint8_t coeff_d, uint8_t d) {
232 4480 return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6;
233 };
234 8882 auto lerp2d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a,
235 uint8_t coeff_b_scalar, uint8x8_t b,
236 uint8_t coeff_c_scalar, uint8x8_t c,
237 uint8_t coeff_d_scalar, uint8x8_t d) {
238 8848 uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar);
239 8848 uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar);
240 8848 uint8x8_t coeff_c = vdup_n_u8(coeff_c_scalar);
241 8848 uint8x8_t coeff_d = vdup_n_u8(coeff_d_scalar);
242 8848 uint16x8_t thirtytwo = vdupq_n_u16(32);
243
244 // a * coeff_a
245 8848 uint16x8_t a1 = vmull_u8(a, coeff_a);
246
247 // b * coeff_b
248 8848 uint16x8_t b1 = vmull_u8(b, coeff_b);
249
250 // c * coeff_c
251 8848 uint16x8_t c1 = vmull_u8(c, coeff_c);
252
253 // d * coeff_d
254 8848 uint16x8_t d1 = vmull_u8(d, coeff_d);
255
256 // a * coeff_a + b * coeff_b
257 8848 uint16x8_t a1_b1 = vaddq_u16(a1, b1);
258
259 // c * coeff_c + d * coeff_d
260 8848 uint16x8_t c1_d1 = vaddq_u16(c1, d1);
261
262 // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d
263 8848 uint16x8_t a1_b1_c1_d1 = vaddq_u16(a1_b1, c1_d1);
264
265 // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32
266 8848 uint16x8_t a1_b1_c1_d1_32 = vaddq_u16(a1_b1_c1_d1, thirtytwo);
267
268 // (a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32) / 64
269 8848 uint8x8_t result = vshrn_n_u16(a1_b1_c1_d1_32, 6);
270 17696 return result;
271 8848 };
272 // Handle top or bottom edge
273 100 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
274 const uint8_t *src_row, uint8_t *dst_row) {
275 // Left elements
276 66 dst_row[1] = dst_row[0] = src_row[0];
277
278 // Right elements
279 66 dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
280
281 // Middle elements
282 66 size_t src_x = 0;
283
2/2
✓ Branch 0 taken 548 times.
✓ Branch 1 taken 66 times.
614 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
284 548 size_t dst_x = src_x * 4 + 2;
285 548 uint8x8_t a = vld1_u8(src_row + src_x);
286 548 uint8x8_t b = vld1_u8(src_row + src_x + 1);
287 548 uint8x8x4_t interpolated = {
288 1644 lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b),
289 1096 lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)};
290
291 548 vst4_u8(dst_row + dst_x, interpolated);
292 548 }
293
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 66 times.
150 for (; src_x + 1 < src_width; ++src_x) {
294 84 size_t dst_x = src_x * 4 + 2;
295 84 const uint8_t a = src_row[src_x], b = src_row[src_x + 1];
296 84 dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b);
297 84 dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b);
298 84 dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b);
299 84 dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b);
300 84 }
301 66 };
302
303 162 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar,
304 lerp2d_vector](const uint8_t *src_row0,
305 const uint8_t *src_row1, uint8_t *dst_row0,
306 uint8_t *dst_row1, uint8_t *dst_row2,
307 uint8_t *dst_row3) {
308 4552 auto lerp2d_vector_49_7_7_1 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c,
309 uint8x8_t d) {
310 4424 return lerp2d_vector_p_q_q_1<49, 7, 32, 6>(a, b, c, d);
311 };
312 4552 auto lerp2d_vector_25_15_15_9 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c,
313 uint8x8_t d) {
314 4424 return lerp2d_vector_p_q_q_r<25, 15, 9, 32, 6>(a, b, c, d);
315 };
316
317 // Left elements
318 128 const uint8_t s0l = src_row0[0], s1l = src_row1[0];
319 128 dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l);
320 128 dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l);
321 128 dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l);
322 128 dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l);
323
324 // Right elements
325 128 const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1];
326 128 const size_t dr0 = dst_width - 2;
327 128 const size_t dr1 = dst_width - 1;
328 128 dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r);
329 128 dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r);
330 128 dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r);
331 128 dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r);
332
333 // Middle elements
334 128 size_t src_x = 0;
335
2/2
✓ Branch 0 taken 1106 times.
✓ Branch 1 taken 128 times.
1234 for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) {
336 1106 size_t dst_x = src_x * 4 + 2;
337
338 1106 uint8x8_t a = vld1_u8(src_row0 + src_x);
339 1106 uint8x8_t b = vld1_u8(src_row0 + src_x + 1);
340 1106 uint8x8_t c = vld1_u8(src_row1 + src_x);
341 1106 uint8x8_t d = vld1_u8(src_row1 + src_x + 1);
342
343 1106 vst4_u8(dst_row0 + dst_x, (uint8x8x4_t{
344 lerp2d_vector_49_7_7_1(a, b, c, d),
345 lerp2d_vector(35, a, 21, b, 5, c, 3, d),
346 lerp2d_vector(21, a, 35, b, 3, c, 5, d),
347 lerp2d_vector_49_7_7_1(b, a, d, c),
348 }));
349 1106 vst4_u8(dst_row1 + dst_x, (uint8x8x4_t{
350 lerp2d_vector(35, a, 5, b, 21, c, 3, d),
351 lerp2d_vector_25_15_15_9(a, b, c, d),
352 lerp2d_vector_25_15_15_9(b, a, d, c),
353 lerp2d_vector(5, a, 35, b, 3, c, 21, d),
354 }));
355 1106 vst4_u8(dst_row2 + dst_x, (uint8x8x4_t{
356 lerp2d_vector(21, a, 3, b, 35, c, 5, d),
357 lerp2d_vector_25_15_15_9(c, a, d, b),
358 lerp2d_vector_25_15_15_9(d, b, c, a),
359 lerp2d_vector(3, a, 21, b, 5, c, 35, d),
360 }));
361 1106 vst4_u8(dst_row3 + dst_x, (uint8x8x4_t{
362 lerp2d_vector_49_7_7_1(c, a, d, b),
363 lerp2d_vector(5, a, 3, b, 35, c, 21, d),
364 lerp2d_vector(3, a, 5, b, 21, c, 35, d),
365 lerp2d_vector_49_7_7_1(d, b, c, a),
366 }));
367 1106 }
368
2/2
✓ Branch 0 taken 280 times.
✓ Branch 1 taken 128 times.
408 for (; src_x + 1 < src_width; ++src_x) {
369 280 size_t dst_x = src_x * 4 + 2;
370 560 const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1],
371 560 c = src_row1[src_x], d = src_row1[src_x + 1];
372
373 280 dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d);
374 280 dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d);
375 280 dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d);
376 280 dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d);
377 280 dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d);
378 280 dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d);
379 280 dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d);
380 280 dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d);
381 280 dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d);
382 280 dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d);
383 280 dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d);
384 280 dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d);
385 280 dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d);
386 280 dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d);
387 280 dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d);
388 280 dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d);
389 280 }
390 128 };
391
392 // Top rows
393
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_begin == 0)) {
394 33 process_edge_row(src, dst);
395 33 memcpy(dst + dst_stride, dst, dst_stride);
396 33 }
397
398 // Middle rows
399
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
162 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
400 128 size_t dst_y = src_y * 4 + 2;
401 128 const uint8_t *src_row0 = src + src_stride * src_y;
402 128 const uint8_t *src_row1 = src_row0 + src_stride;
403 128 uint8_t *dst_row0 = dst + dst_stride * dst_y;
404 128 uint8_t *dst_row1 = dst_row0 + dst_stride;
405 128 uint8_t *dst_row2 = dst_row1 + dst_stride;
406 128 uint8_t *dst_row3 = dst_row2 + dst_stride;
407
408 128 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
409 128 }
410
411 // Bottom rows
412
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_end == src_height)) {
413 66 process_edge_row(src + src_stride * (src_height - 1),
414 33 dst + dst_stride * (dst_height - 2));
415 99 memcpy(dst + dst_stride * (dst_height - 1),
416 66 dst + dst_stride * (dst_height - 2), dst_stride);
417 33 }
418
419 34 return KLEIDICV_OK;
420 34 }
421
422 KLEIDICV_TARGET_FN_ATTRS
423 95 kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride,
424 size_t src_width, size_t src_height,
425 size_t y_begin, size_t y_end,
426 uint8_t *dst, size_t dst_stride,
427 size_t dst_width, size_t dst_height) {
428
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 94 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 94 times.
95 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
429
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 93 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 93 times.
94 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
430
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 92 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 91 times.
93 CHECK_IMAGE_SIZE(dst_width, dst_height);
431
432
4/4
✓ Branch 0 taken 89 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 88 times.
91 if (src_width == 0 || src_height == 0) {
433 3 return KLEIDICV_OK;
434 }
435
3/4
✓ Branch 0 taken 54 times.
✓ Branch 1 taken 34 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
88 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
436 108 return resize_2x2_u8(src, src_stride, src_width, src_height, y_begin, y_end,
437 54 dst, dst_stride);
438 }
439
2/4
✓ Branch 0 taken 34 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 34 times.
34 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
440 68 return resize_4x4_u8(src, src_stride, src_width, src_height, y_begin, y_end,
441 34 dst, dst_stride);
442 }
443 // resize_linear_u8_is_implemented checked the kernel size already.
444 // GCOVR_EXCL_START
445 assert(!"resize ratio not implemented");
446 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
447 // GCOVR_EXCL_STOP
448 95 }
449
450 50 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32(
451 const float *src, size_t src_stride, size_t src_width, size_t src_height,
452 size_t y_begin, size_t y_end, float *dst, size_t dst_stride) {
453 50 size_t dst_width = src_width * 2;
454 50 src_stride /= sizeof(float);
455 50 dst_stride /= sizeof(float);
456
457 874 auto lerp1d_scalar = [](float near, float far) {
458 824 return near * 0.75F + far * 0.25F;
459 };
460
461 2250 auto lerp1d_vector = [](float32x4_t near, float32x4_t far) {
462 2200 return vmlaq_n_f32(vmulq_n_f32(near, 0.75F), far, 0.25F);
463 };
464
465 1282 auto lerp2d_scalar = [](float near, float mid_a, float mid_b, float far) {
466 1232 return near * 0.5625F + mid_a * 0.1875F + mid_b * 0.1875F + far * 0.0625F;
467 };
468
469 8922 auto lerp2d_vector = [](float32x4_t a, float32x4_t b, float32x4_t c,
470 float32x4_t d) {
471 8872 return vmlaq_n_f32(
472 8872 vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, 0.5625F), b, 0.1875F), c,
473 0.1875F),
474 8872 d, 0.0625F);
475 };
476
477 // Handle top or bottom edge
478 148 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
479 const float *src_row, float *dst_row) {
480 // Left element
481 98 dst_row[0] = src_row[0];
482
483 // Right element
484 98 dst_row[dst_width - 1] = src_row[src_width - 1];
485
486 // Middle elements
487 98 size_t src_x = 0;
488
2/2
✓ Branch 0 taken 1100 times.
✓ Branch 1 taken 98 times.
1198 for (; src_x + 4 < src_width; src_x += 4) {
489 1100 size_t dst_x = src_x * 2 + 1;
490 1100 float32x4_t src_left = vld1q_f32(src_row + src_x);
491 1100 float32x4_t src_right = vld1q_f32(src_row + src_x + 1);
492
493 1100 float32x4_t dst_left = lerp1d_vector(src_left, src_right);
494 1100 float32x4_t dst_right = lerp1d_vector(src_right, src_left);
495
496 1100 vst2q_f32(dst_row + dst_x, (float32x4x2_t{dst_left, dst_right}));
497 1100 }
498
2/2
✓ Branch 0 taken 116 times.
✓ Branch 1 taken 98 times.
214 for (; src_x + 1 < src_width; ++src_x) {
499 116 size_t dst_x = src_x * 2 + 1;
500 116 const float src_left = src_row[src_x], src_right = src_row[src_x + 1];
501 116 dst_row[dst_x] = lerp1d_scalar(src_left, src_right);
502 116 dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left);
503 116 }
504 98 };
505
506 198 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar,
507 lerp2d_vector](const float *src_row0,
508 const float *src_row1, float *dst_row0,
509 float *dst_row1) {
510 // Left element
511 148 dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]);
512 148 dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]);
513
514 // Right element
515 148 dst_row0[dst_width - 1] =
516 148 lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]);
517 148 dst_row1[dst_width - 1] =
518 148 lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]);
519
520 // Middle elements
521 148 size_t src_x = 0;
522
2/2
✓ Branch 0 taken 2218 times.
✓ Branch 1 taken 148 times.
2366 for (; src_x + 4 < src_width; src_x += 4) {
523 2218 size_t dst_x = src_x * 2 + 1;
524
525 2218 float32x4_t a = vld1q_f32(src_row0 + src_x);
526 2218 float32x4_t b = vld1q_f32(src_row0 + src_x + 1);
527 2218 float32x4_t c = vld1q_f32(src_row1 + src_x);
528 2218 float32x4_t d = vld1q_f32(src_row1 + src_x + 1);
529
530 2218 vst2q_f32(dst_row0 + dst_x, (float32x4x2_t{lerp2d_vector(a, b, c, d),
531 lerp2d_vector(b, a, d, c)}));
532 2218 vst2q_f32(dst_row1 + dst_x, (float32x4x2_t{lerp2d_vector(c, a, d, b),
533 lerp2d_vector(d, b, c, a)}));
534 2218 }
535
2/2
✓ Branch 0 taken 308 times.
✓ Branch 1 taken 148 times.
456 for (; src_x + 1 < src_width; ++src_x) {
536 308 size_t dst_x = src_x * 2 + 1;
537 616 const float a = src_row0[src_x], b = src_row0[src_x + 1],
538 616 c = src_row1[src_x], d = src_row1[src_x + 1];
539 308 dst_row0[dst_x] = lerp2d_scalar(a, b, c, d);
540 308 dst_row0[dst_x + 1] = lerp2d_scalar(b, a, d, c);
541 308 dst_row1[dst_x] = lerp2d_scalar(c, a, d, b);
542 308 dst_row1[dst_x + 1] = lerp2d_scalar(d, b, c, a);
543 308 }
544 148 };
545
546 // Top row
547
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
50 if (KLEIDICV_LIKELY(y_begin == 0)) {
548 49 process_edge_row(src, dst);
549 49 }
550
551 // Middle rows
552
2/2
✓ Branch 0 taken 148 times.
✓ Branch 1 taken 50 times.
198 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
553 148 size_t dst_y = src_y * 2 + 1;
554 148 const float *src_row0 = src + src_stride * src_y;
555 148 const float *src_row1 = src_row0 + src_stride;
556 148 float *dst_row0 = dst + dst_stride * dst_y;
557 148 float *dst_row1 = dst_row0 + dst_stride;
558
559 148 process_row(src_row0, src_row1, dst_row0, dst_row1);
560 148 }
561
562 // Bottom row
563
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
50 if (KLEIDICV_LIKELY(y_end == src_height)) {
564 98 process_edge_row(src + src_stride * (src_height - 1),
565 49 dst + dst_stride * (src_height * 2 - 1));
566 49 }
567
568 50 return KLEIDICV_OK;
569 50 }
570
571 38 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32(
572 const float *src, size_t src_stride, size_t src_width, size_t src_height,
573 size_t y_begin, size_t y_end, float *dst, size_t dst_stride) {
574 using T = float;
575 38 size_t dst_height = src_height * 4;
576 38 size_t dst_width = src_width * 4;
577 38 src_stride /= sizeof(T);
578 38 dst_stride /= sizeof(T);
579
580 1398 auto lerp1d_scalar = [](T coeff_a, T a, T coeff_b, T b) {
581 1360 return coeff_a * a + coeff_b * b;
582 };
583 22182 auto lerp1d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b) {
584 22144 return vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b);
585 };
586 4198 auto lerp2d_scalar = [](T coeff_a, T a, T coeff_b, T b, T coeff_c, T c,
587 T coeff_d, T d) {
588 4160 return coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d;
589 };
590 17782 auto lerp2d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b,
591 T coeff_c, float32x4_t c, T coeff_d, float32x4_t d) {
592 17744 return vmlaq_n_f32(
593 35488 vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b), c,
594 17744 coeff_c),
595 17744 d, coeff_d);
596 };
597 // Handle top or bottom edge
598 112 auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector](
599 const T *src_row, T *dst_row) {
600 // Left elements
601 74 dst_row[1] = dst_row[0] = src_row[0];
602
603 // Right elements
604 74 dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
605
606 // Middle elements
607 74 size_t src_x = 0;
608
2/2
✓ Branch 0 taken 1100 times.
✓ Branch 1 taken 74 times.
1174 for (; src_x + 4 < src_width; src_x += 4) {
609 1100 size_t dst_x = src_x * 4 + 2;
610 1100 float32x4_t a = vld1q_f32(src_row + src_x);
611 1100 float32x4_t b = vld1q_f32(src_row + src_x + 1);
612 1100 vst4q_f32(dst_row + dst_x,
613 (float32x4x4_t{lerp1d_vector(0.875F, a, 0.125F, b),
614 lerp1d_vector(0.625F, a, 0.375F, b),
615 lerp1d_vector(0.375F, a, 0.625F, b),
616 lerp1d_vector(0.125F, a, 0.875F, b)}));
617 1100 }
618
2/2
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 74 times.
150 for (; src_x + 1 < src_width; ++src_x) {
619 76 size_t dst_x = src_x * 4 + 2;
620 76 const T a = src_row[src_x], b = src_row[src_x + 1];
621 76 dst_row[dst_x + 0] = lerp1d_scalar(0.875F, a, 0.125F, b);
622 76 dst_row[dst_x + 1] = lerp1d_scalar(0.625F, a, 0.375F, b);
623 76 dst_row[dst_x + 2] = lerp1d_scalar(0.375F, a, 0.625F, b);
624 76 dst_row[dst_x + 3] = lerp1d_scalar(0.125F, a, 0.875F, b);
625 76 }
626 74 };
627
628 170 auto process_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector,
629 lerp2d_scalar, lerp2d_vector](
630 const T *src_row0, const T *src_row1, T *dst_row0,
631 T *dst_row1, T *dst_row2, T *dst_row3) {
632 // Left elements
633 132 const T s0l = src_row0[0], s1l = src_row1[0];
634 132 dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l);
635 132 dst_row1[0] = dst_row1[1] = lerp1d_scalar(0.625F, s0l, 0.375F, s1l);
636 132 dst_row2[0] = dst_row2[1] = lerp1d_scalar(0.375F, s0l, 0.625F, s1l);
637 132 dst_row3[0] = dst_row3[1] = lerp1d_scalar(0.125F, s0l, 0.875F, s1l);
638
639 // Right elements
640 132 const T s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1];
641 132 const size_t dr0 = dst_width - 2;
642 132 const size_t dr1 = dst_width - 1;
643 132 dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(0.875F, s0r, 0.125F, s1r);
644 132 dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(0.625F, s0r, 0.375F, s1r);
645 132 dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(0.375F, s0r, 0.625F, s1r);
646 132 dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r);
647
648 // Middle elements
649 132 size_t src_x = 0;
650
2/2
✓ Branch 0 taken 2218 times.
✓ Branch 1 taken 132 times.
2350 for (; src_x + 4 < src_width; src_x += 4) {
651 2218 size_t dst_x = src_x * 4 + 2;
652
653 2218 float32x4_t a = vld1q_f32(src_row0 + src_x);
654 2218 float32x4_t b = vld1q_f32(src_row0 + src_x + 1);
655 2218 float32x4_t c = vld1q_f32(src_row1 + src_x);
656 2218 float32x4_t d = vld1q_f32(src_row1 + src_x + 1);
657
658 2218 float32x4x4_t dst_a{
659 8872 lerp2d_vector(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d),
660 2218 lerp2d_vector(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d),
661 2218 lerp2d_vector(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d),
662 2218 lerp2d_vector(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d),
663 };
664 2218 float32x4x4_t dst_d{
665 8872 lerp2d_vector(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d),
666 2218 lerp2d_vector(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d),
667 2218 lerp2d_vector(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d),
668 2218 lerp2d_vector(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d),
669 };
670 2218 const float one_3rd = 0.3333333333333333F;
671 2218 const float two_3rd = 0.6666666666666667F;
672 2218 vst4q_f32(dst_row0 + dst_x, dst_a);
673 2218 vst4q_f32(dst_row1 + dst_x,
674 (float32x4x4_t{
675 lerp1d_vector(two_3rd, dst_a.val[0], one_3rd, dst_d.val[0]),
676 lerp1d_vector(two_3rd, dst_a.val[1], one_3rd, dst_d.val[1]),
677 lerp1d_vector(two_3rd, dst_a.val[2], one_3rd, dst_d.val[2]),
678 lerp1d_vector(two_3rd, dst_a.val[3], one_3rd, dst_d.val[3]),
679 }));
680 2218 vst4q_f32(dst_row2 + dst_x,
681 (float32x4x4_t{
682 lerp1d_vector(one_3rd, dst_a.val[0], two_3rd, dst_d.val[0]),
683 lerp1d_vector(one_3rd, dst_a.val[1], two_3rd, dst_d.val[1]),
684 lerp1d_vector(one_3rd, dst_a.val[2], two_3rd, dst_d.val[2]),
685 lerp1d_vector(one_3rd, dst_a.val[3], two_3rd, dst_d.val[3]),
686 }));
687 2218 vst4q_f32(dst_row3 + dst_x, dst_d);
688 2218 }
689
690
2/2
✓ Branch 0 taken 260 times.
✓ Branch 1 taken 132 times.
392 for (; src_x + 1 < src_width; ++src_x) {
691 260 size_t dst_x = src_x * 4 + 2;
692 260 const T a = src_row0[src_x], b = src_row0[src_x + 1], c = src_row1[src_x],
693 260 d = src_row1[src_x + 1];
694
695 260 dst_row0[dst_x + 0] =
696 260 lerp2d_scalar(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d);
697 260 dst_row0[dst_x + 1] =
698 260 lerp2d_scalar(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d);
699 260 dst_row0[dst_x + 2] =
700 260 lerp2d_scalar(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d);
701 260 dst_row0[dst_x + 3] =
702 260 lerp2d_scalar(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d);
703 260 dst_row1[dst_x + 0] =
704 260 lerp2d_scalar(0.546875F, a, 0.078125F, b, 0.328125F, c, 0.046875F, d);
705 260 dst_row1[dst_x + 1] =
706 260 lerp2d_scalar(0.390625F, a, 0.234375F, b, 0.234375F, c, 0.140625F, d);
707 260 dst_row1[dst_x + 2] =
708 260 lerp2d_scalar(0.234375F, a, 0.390625F, b, 0.140625F, c, 0.234375F, d);
709 260 dst_row1[dst_x + 3] =
710 260 lerp2d_scalar(0.078125F, a, 0.546875F, b, 0.046875F, c, 0.328125F, d);
711 260 dst_row2[dst_x + 0] =
712 260 lerp2d_scalar(0.328125F, a, 0.046875F, b, 0.546875F, c, 0.078125F, d);
713 260 dst_row2[dst_x + 1] =
714 260 lerp2d_scalar(0.234375F, a, 0.140625F, b, 0.390625F, c, 0.234375F, d);
715 260 dst_row2[dst_x + 2] =
716 260 lerp2d_scalar(0.140625F, a, 0.234375F, b, 0.234375F, c, 0.390625F, d);
717 260 dst_row2[dst_x + 3] =
718 260 lerp2d_scalar(0.046875F, a, 0.328125F, b, 0.078125F, c, 0.546875F, d);
719 260 dst_row3[dst_x + 0] =
720 260 lerp2d_scalar(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d);
721 260 dst_row3[dst_x + 1] =
722 260 lerp2d_scalar(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d);
723 260 dst_row3[dst_x + 2] =
724 260 lerp2d_scalar(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d);
725 260 dst_row3[dst_x + 3] =
726 260 lerp2d_scalar(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d);
727 260 }
728 132 };
729
730 // Top rows
731
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 37 times.
38 if (KLEIDICV_LIKELY(y_begin == 0)) {
732 37 process_edge_row(src, dst);
733 37 memcpy(dst + dst_stride, dst, dst_stride * sizeof(T));
734 37 }
735
736 // Middle rows
737
2/2
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 38 times.
170 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
738 132 size_t dst_y = src_y * 4 + 2;
739 132 const T *src_row0 = src + src_stride * src_y;
740 132 const T *src_row1 = src_row0 + src_stride;
741 132 T *dst_row0 = dst + dst_stride * dst_y;
742 132 T *dst_row1 = dst_row0 + dst_stride;
743 132 T *dst_row2 = dst_row1 + dst_stride;
744 132 T *dst_row3 = dst_row2 + dst_stride;
745
746 132 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
747 132 }
748
749 // Bottom rows
750
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 37 times.
38 if (KLEIDICV_LIKELY(y_end == src_height)) {
751 74 process_edge_row(src + src_stride * (src_height - 1),
752 37 dst + dst_stride * (dst_height - 2));
753 111 memcpy(dst + dst_stride * (dst_height - 1),
754 74 dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T));
755 37 }
756
757 38 return KLEIDICV_OK;
758 38 }
759
760 34 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32(
761 const float *src, size_t src_stride, size_t src_width, size_t src_height,
762 size_t y_begin, size_t y_end, float *dst, size_t dst_stride) {
763 34 size_t dst_width = src_width * 8;
764 34 size_t dst_height = src_height * 8;
765 34 src_stride /= sizeof(float);
766 34 dst_stride /= sizeof(float);
767
768 34 float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0,
769 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0};
770 34 float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0,
771 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0};
772 34 float32x4_t coeffs_a0 = vld1q_f32(&coeffs_a[0]);
773 34 float32x4_t coeffs_a1 = vld1q_f32(&coeffs_a[4]);
774 34 float32x4_t coeffs_b0 = vld1q_f32(&coeffs_b[0]);
775 34 float32x4_t coeffs_b1 = vld1q_f32(&coeffs_b[4]);
776
777 2082 auto lerp1d_vector_n = [](float p, float32x4_t a, float q, float32x4_t b) {
778 2048 return vmlaq_n_f32(vmulq_n_f32(a, p), b, q);
779 };
780
781 109570 auto lerp1d_vector_n2 = [](float32x4_t a, float q, float32x4_t b) {
782 109536 return vmlaq_n_f32(a, b, q);
783 };
784
785 8970 auto lerp1d_vector = [](float32x4_t p, float32x4_t a, float32x4_t q,
786 float32x4_t b) {
787 8936 return vmlaq_f32(vmulq_f32(a, p), b, q);
788 };
789
790 // Handle top or bottom edge
791 34 auto process_edge_row =
792 100 [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0,
793 &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) {
794 // Left elements
795 66 dst_row[3] = dst_row[2] = dst_row[1] = dst_row[0] = src_row[0];
796 66 dst_row[dst_stride + 3] = dst_row[dst_stride + 2] =
797 66 dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0];
798 66 dst_row[2 * dst_stride + 3] = dst_row[2 * dst_stride + 2] =
799 66 dst_row[2 * dst_stride + 1] = dst_row[2 * dst_stride] = src_row[0];
800 66 dst_row[3 * dst_stride + 3] = dst_row[3 * dst_stride + 2] =
801 66 dst_row[3 * dst_stride + 1] = dst_row[3 * dst_stride] = src_row[0];
802
803 // Right elements
804 66 float *dst_right = dst_row + dst_width - 4;
805 66 dst_right[3] = dst_right[2] = dst_right[1] = dst_right[0] =
806 66 src_row[src_width - 1];
807 66 dst_right[dst_stride + 3] = dst_right[dst_stride + 2] =
808 66 dst_right[dst_stride + 1] = dst_right[dst_stride] =
809 66 src_row[src_width - 1];
810 66 dst_right[2 * dst_stride + 3] = dst_right[2 * dst_stride + 2] =
811 66 dst_right[2 * dst_stride + 1] = dst_right[2 * dst_stride] =
812 66 src_row[src_width - 1];
813 66 dst_right[3 * dst_stride + 3] = dst_right[3 * dst_stride + 2] =
814 66 dst_right[3 * dst_stride + 1] = dst_right[3 * dst_stride] =
815 66 src_row[src_width - 1];
816
817 // Middle elements
818 66 float32x4_t a, b = vdupq_n_f32(src_row[0]);
819
2/2
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 4468 times.
4534 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
820 4468 a = b;
821 4468 b = vdupq_n_f32(src_row[src_x + 1]);
822 4468 float *dst_row0 = dst_row + src_x * 8 + 4;
823 4468 float *dst_row1 = dst_row0 + dst_stride;
824 4468 float *dst_row2 = dst_row1 + dst_stride;
825 4468 float *dst_row3 = dst_row2 + dst_stride;
826 4468 float32x4_t dst = lerp1d_vector(coeffs_a0, a, coeffs_b0, b);
827 4468 vst1q(dst_row0, dst);
828 4468 vst1q(dst_row1, dst);
829 4468 vst1q(dst_row2, dst);
830 4468 vst1q(dst_row3, dst);
831 4468 dst = lerp1d_vector(coeffs_a1, a, coeffs_b1, b);
832 4468 vst1q(dst_row0 + 4, dst);
833 4468 vst1q(dst_row1 + 4, dst);
834 4468 vst1q(dst_row2 + 4, dst);
835 4468 vst1q(dst_row3 + 4, dst);
836 4468 }
837 66 };
838
839 34 float32x4_t coeffs_p0 = vmulq_n_f32(coeffs_a0, 15.0 / 16);
840 34 float32x4_t coeffs_q0 = vmulq_n_f32(coeffs_b0, 15.0 / 16);
841 34 float32x4_t coeffs_r0 = vmulq_n_f32(coeffs_a0, 1.0 / 16);
842 34 float32x4_t coeffs_s0 = vmulq_n_f32(coeffs_b0, 1.0 / 16);
843 34 float32x4_t coeffs_p1 = vmulq_n_f32(coeffs_a1, 15.0 / 16);
844 34 float32x4_t coeffs_q1 = vmulq_n_f32(coeffs_b1, 15.0 / 16);
845 34 float32x4_t coeffs_r1 = vmulq_n_f32(coeffs_a1, 1.0 / 16);
846 34 float32x4_t coeffs_s1 = vmulq_n_f32(coeffs_b1, 1.0 / 16);
847
848 36546 auto lerp2d_vector = [](float32x4_t a, float32x4_t p, float32x4_t b,
849 float32x4_t q, float32x4_t c, float32x4_t r,
850 float32x4_t d, float32x4_t s) {
851 36512 return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s);
852 };
853
854 162 auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n,
855 lerp1d_vector_n2, &coeffs_p0, &coeffs_q0, &coeffs_r0,
856 &coeffs_s0, &coeffs_p1, &coeffs_q1, &coeffs_r1,
857 &coeffs_s1](const float *src_row0, const float *src_row1,
858 float *dst_row0, size_t dst_stride) {
859 // Left elements
860 128 float32x4_t s0 = vdupq_n_f32(src_row0[0]);
861 128 float32x4_t s1 = vdupq_n_f32(src_row1[0]);
862 128 float *dst_row = dst_row0;
863
2/2
✓ Branch 0 taken 1024 times.
✓ Branch 1 taken 128 times.
1152 for (size_t i = 0; i < 8; ++i) {
864 2048 vst1q(dst_row,
865 2048 lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0,
866 1024 static_cast<float>(i * 2 + 1) / 16.0F, s1));
867 1024 dst_row += dst_stride;
868 1024 }
869
870 // Middle elements
871 128 dst_row0 += 4;
872 128 float *dst_row1 = dst_row0 + dst_stride;
873 128 float *dst_row2 = dst_row1 + dst_stride;
874 128 float *dst_row3 = dst_row2 + dst_stride;
875 128 float *dst_row4 = dst_row3 + dst_stride;
876 128 float *dst_row5 = dst_row4 + dst_stride;
877 128 float *dst_row6 = dst_row5 + dst_stride;
878 128 float *dst_row7 = dst_row6 + dst_stride;
879 128 float32x4_t a, b = s0;
880 128 float32x4_t c, d = s1;
881
2/2
✓ Branch 0 taken 9128 times.
✓ Branch 1 taken 128 times.
9256 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
882 9128 a = b;
883 9128 b = vdupq_n_f32(src_row0[src_x + 1]);
884 9128 c = d;
885 9128 d = vdupq_n_f32(src_row1[src_x + 1]);
886 9128 float32x4x2_t dst_0;
887 9128 dst_0.val[0] =
888 9128 lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d);
889 9128 dst_0.val[1] =
890 9128 lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d);
891
892 9128 neon::VecTraits<float>::store(dst_0, dst_row0);
893 9128 float32x4x2_t dst_7;
894 9128 dst_7.val[0] =
895 9128 lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d);
896 9128 dst_7.val[1] =
897 9128 lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d);
898
899 9128 neon::VecTraits<float>::store(dst_7, dst_row7);
900 9128 float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]);
901 9128 float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]);
902
903 9128 float32x4x2_t dst;
904 9128 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0);
905 9128 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1);
906
907 9128 neon::VecTraits<float>::store(dst, dst_row1);
908 9128 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0);
909 9128 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1);
910
911 9128 neon::VecTraits<float>::store(dst, dst_row2);
912 9128 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0);
913 9128 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1);
914
915 9128 neon::VecTraits<float>::store(dst, dst_row3);
916 9128 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0);
917 9128 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1);
918
919 9128 neon::VecTraits<float>::store(dst, dst_row4);
920 9128 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0);
921 9128 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1);
922
923 9128 neon::VecTraits<float>::store(dst, dst_row5);
924 9128 dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0);
925 9128 dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1);
926
927 9128 neon::VecTraits<float>::store(dst, dst_row6);
928 9128 dst_row0 += 8;
929 9128 dst_row1 += 8;
930 9128 dst_row2 += 8;
931 9128 dst_row3 += 8;
932 9128 dst_row4 += 8;
933 9128 dst_row5 += 8;
934 9128 dst_row6 += 8;
935 9128 dst_row7 += 8;
936 9128 }
937
938 // Right elements
939 128 s0 = b;
940 128 s1 = d;
941 128 dst_row = dst_row0;
942
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 1024 times.
1152 for (size_t i = 0; i < 8; ++i) {
943 2048 vst1q(dst_row,
944 2048 lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0,
945 1024 static_cast<float>(i * 2 + 1) / 16.0F, s1));
946 1024 dst_row += dst_stride;
947 1024 }
948 128 };
949
950 // Top rows
951
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_begin == 0)) {
952 33 process_edge_row(src, dst, dst_stride);
953 33 }
954
955 // Middle rows
956
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
162 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
957 128 size_t dst_y = src_y * 8 + 4;
958 128 const float *src_row0 = src + src_stride * src_y;
959 128 const float *src_row1 = src_row0 + src_stride;
960 128 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride);
961 128 }
962
963 // Bottom rows
964
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_end == src_height)) {
965 66 process_edge_row(src + src_stride * (src_height - 1),
966 33 dst + dst_stride * (dst_height - 4), dst_stride);
967 33 }
968
969 34 return KLEIDICV_OK;
970 34 }
971
972 129 kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride,
973 size_t src_width, size_t src_height,
974 size_t y_begin, size_t y_end,
975 float *dst, size_t dst_stride,
976 size_t dst_width, size_t dst_height) {
977
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 128 times.
129 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
978
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 127 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 127 times.
128 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
979
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 126 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 125 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 125 times.
127 CHECK_IMAGE_SIZE(dst_width, dst_height);
980
981
4/4
✓ Branch 0 taken 123 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 122 times.
125 if (src_width == 0 || src_height == 0) {
982 3 return KLEIDICV_OK;
983 }
984
3/4
✓ Branch 0 taken 50 times.
✓ Branch 1 taken 72 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 50 times.
122 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
985 100 return resize_2x2_f32(src, src_stride, src_width, src_height, y_begin,
986 50 y_end, dst, dst_stride);
987 }
988
3/4
✓ Branch 0 taken 38 times.
✓ Branch 1 taken 34 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 38 times.
72 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
989 76 return resize_4x4_f32(src, src_stride, src_width, src_height, y_begin,
990 38 y_end, dst, dst_stride);
991 }
992
2/4
✓ Branch 0 taken 34 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 34 times.
34 if (src_width * 8 == dst_width && src_height * 8 == dst_height) {
993 68 return resize_8x8_f32(src, src_stride, src_width, src_height, y_begin,
994 34 y_end, dst, dst_stride);
995 }
996 // resize_linear_f32_is_implemented checked the kernel size already.
997 // GCOVR_EXCL_START
998 assert(!"resize ratio not implemented");
999 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1000 // GCOVR_EXCL_STOP
1001 129 }
1002
1003 } // namespace kleidicv::neon
1004