KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_sc.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 842 842 100.0%
Functions: 64 70 91.4%
Branches: 129 136 94.9%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RESIZE_LINEAR_SC_H
6 #define KLEIDICV_RESIZE_LINEAR_SC_H
7
8 #include <cassert>
9
10 #include "kleidicv/kleidicv.h"
11 #include "kleidicv/sve2.h"
12
13 namespace KLEIDICV_TARGET_NAMESPACE {
14
15 216 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc(
16 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
17 size_t y_begin, size_t y_end, uint8_t *dst,
18 size_t dst_stride) KLEIDICV_STREAMING {
19 216 size_t dst_width = src_width * 2;
20 216 size_t dst_height = src_height * 2;
21
22 4024 auto lerp1d_vector = [](svuint8_t near, svuint8_t far) KLEIDICV_STREAMING {
23 // near * 3
24 3808 svuint16_t near3b = svmullb(near, uint8_t{3});
25 3808 svuint16_t near3t = svmullt(near, uint8_t{3});
26
27 // near * 3 + far
28 3808 svuint16_t near3_far_b = svaddwb(near3b, far);
29 3808 svuint16_t near3_far_t = svaddwt(near3t, far);
30
31 // near * 3 + far + 2
32 3808 svuint16_t near3_far_2b = svaddwb(near3_far_b, uint8_t{2});
33 3808 svuint16_t near3_far_2t = svaddwt(near3_far_t, uint8_t{2});
34
35 // (near * 3 + far + 2) / 4
36 3808 svuint8_t near3_far_2_div4 = svshrnb_n_u16(near3_far_2b, 2);
37 3808 near3_far_2_div4 = svshrnt_n_u16(near3_far_2_div4, near3_far_2t, 2);
38 7616 return near3_far_2_div4;
39 3808 };
40
41 5432 auto lerp2d_vector = [](svbool_t pg, svuint8_t near, svuint8_t mid_a,
42 svuint8_t mid_b, svuint8_t far) KLEIDICV_STREAMING {
43 // near * 9
44 5216 svuint16_t near9b = svmullb(near, uint8_t{9});
45 5216 svuint16_t near9t = svmullt(near, uint8_t{9});
46
47 // mid_a + mid_b
48 5216 svuint16_t midb = svaddlb(mid_a, mid_b);
49 5216 svuint16_t midt = svaddlt(mid_a, mid_b);
50
51 // near * 9 + (mid_a + mid_b) * 3
52 5216 svuint16_t near9_mid3b = svmla_x(pg, near9b, midb, uint16_t{3});
53 5216 svuint16_t near9_mid3t = svmla_x(pg, near9t, midt, uint16_t{3});
54
55 // near * 9 + (mid_a + mid_b) * 3 + far
56 5216 svuint16_t near9_mid3_far_b = svaddwb(near9_mid3b, far);
57 5216 svuint16_t near9_mid3_far_t = svaddwt(near9_mid3t, far);
58
59 // near * 9 + (mid_a + mid_b) * 3 + far + 8
60 5216 svuint16_t near9_mid3_far_8b = svaddwb(near9_mid3_far_b, uint8_t{8});
61 5216 svuint16_t near9_mid3_far_8t = svaddwt(near9_mid3_far_t, uint8_t{8});
62
63 // (near * 9 + (mid_a + mid_b) * 3 + far + 8) / 16
64 5216 svuint8_t near9_mid3_far_8_div16 = svshrnb_n_u16(near9_mid3_far_8b, 4);
65 5216 near9_mid3_far_8_div16 =
66 5216 svshrnt_n_u16(near9_mid3_far_8_div16, near9_mid3_far_8t, 4);
67 10432 return near9_mid3_far_8_div16;
68 5216 };
69
70 // Handle top or bottom edge
71 640 auto process_edge_row = [src_width, dst_width, lerp1d_vector](
72 const uint8_t *src_row,
73 uint8_t *dst_row) KLEIDICV_STREAMING {
74 // Left element
75 424 dst_row[0] = src_row[0];
76
77 // Right element
78 424 dst_row[dst_width - 1] = src_row[src_width - 1];
79
80
2/2
✓ Branch 0 taken 424 times.
✓ Branch 1 taken 720 times.
1144 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
81 720 size_t dst_x = src_x * 2 + 1;
82
83 720 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
84
85 720 svuint8_t src_left = svld1_u8(pg, src_row + src_x);
86 720 svuint8_t src_right = svld1_u8(pg, src_row + src_x + 1);
87
88 720 svuint8_t dst_left = lerp1d_vector(src_left, src_right);
89 720 svuint8_t dst_right = lerp1d_vector(src_right, src_left);
90
91 720 svst2_u8(pg, dst_row + dst_x, svcreate2(dst_left, dst_right));
92 720 }
93 424 };
94
95 808 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
96 const uint8_t *src_row0, const uint8_t *src_row1,
97 uint8_t *dst_row0,
98 uint8_t *dst_row1) KLEIDICV_STREAMING {
99 // Left elements
100 592 svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read/write 1 element
101 {
102 592 svuint8_t s0 = svld1(pg1, src_row0);
103 592 svuint8_t s1 = svld1(pg1, src_row1);
104 592 svst1(pg1, dst_row0, lerp1d_vector(s0, s1));
105 592 svst1(pg1, dst_row1, lerp1d_vector(s1, s0));
106 592 }
107
108 // Middle elements
109
2/2
✓ Branch 0 taken 592 times.
✓ Branch 1 taken 1304 times.
1896 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
110 1304 size_t dst_x = src_x * 2 + 1;
111
112 1304 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
113
114 1304 svuint8_t src_tl = svld1_u8(pg, src_row0 + src_x);
115 1304 svuint8_t src_tr = svld1_u8(pg, src_row0 + src_x + 1);
116 1304 svuint8_t src_bl = svld1_u8(pg, src_row1 + src_x);
117 1304 svuint8_t src_br = svld1_u8(pg, src_row1 + src_x + 1);
118
119 1304 svuint8_t dst_tl = lerp2d_vector(pg, src_tl, src_tr, src_bl, src_br);
120 1304 svuint8_t dst_tr = lerp2d_vector(pg, src_tr, src_tl, src_br, src_bl);
121 1304 svuint8_t dst_bl = lerp2d_vector(pg, src_bl, src_tl, src_br, src_tr);
122 1304 svuint8_t dst_br = lerp2d_vector(pg, src_br, src_tr, src_bl, src_tl);
123
124 1304 svst2_u8(pg, dst_row0 + dst_x, svcreate2(dst_tl, dst_tr));
125 1304 svst2_u8(pg, dst_row1 + dst_x, svcreate2(dst_bl, dst_br));
126 1304 }
127
128 // Right elements
129 592 svuint8_t s0 = svld1(pg1, src_row0 + src_width - 1);
130 592 svuint8_t s1 = svld1(pg1, src_row1 + src_width - 1);
131 592 svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(s0, s1));
132 592 svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(s1, s0));
133 592 };
134
135 // Top row
136
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 212 times.
216 if (KLEIDICV_LIKELY(y_begin == 0)) {
137 212 process_edge_row(src, dst);
138 212 }
139
140 // Middle rows
141
2/2
✓ Branch 0 taken 592 times.
✓ Branch 1 taken 216 times.
808 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
142 592 size_t dst_y = src_y * 2 + 1;
143 592 const uint8_t *src_row0 = src + src_stride * src_y;
144 592 const uint8_t *src_row1 = src_row0 + src_stride;
145 592 uint8_t *dst_row0 = dst + dst_stride * dst_y;
146 592 uint8_t *dst_row1 = dst_row0 + dst_stride;
147
148 592 process_row(src_row0, src_row1, dst_row0, dst_row1);
149 592 }
150
151 // Bottom row
152
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 212 times.
216 if (KLEIDICV_LIKELY(y_end == src_height)) {
153 424 process_edge_row(src + src_stride * (src_height - 1),
154 212 dst + dst_stride * (dst_height - 1));
155 212 }
156
157 216 return KLEIDICV_OK;
158 216 }
159
160 136 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc(
161 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
162 size_t y_begin, size_t y_end, uint8_t *dst,
163 size_t dst_stride) KLEIDICV_STREAMING {
164 136 size_t dst_width = src_width * 4;
165 136 size_t dst_height = src_height * 4;
166
167 6600 auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b)
168 KLEIDICV_STREAMING {
169 // bias
170 6464 svuint16_t top = svdup_u16(4);
171
172 // bias + a * p
173 6464 svuint16_t bot = svmlalb(top, a, p);
174 6464 top = svmlalt(top, a, p);
175
176 // bias + a * p + b * q
177 6464 bot = svmlalb(bot, b, q);
178 6464 top = svmlalt(top, b, q);
179
180 // (bias + a * p + b * q) / 8
181 6464 svuint8_t result = svshrnb(bot, 3ULL);
182 6464 result = svshrnt(result, top, 3ULL);
183 12928 return result;
184 6464 };
185
186 19720 auto lerp2d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b,
187 uint8_t r, svuint8_t c, uint8_t s,
188 svuint8_t d) KLEIDICV_STREAMING {
189 // bias
190 19584 svuint16_t top = svdup_u16(32);
191
192 // bias + a * p
193 19584 svuint16_t bot = svmlalb(top, a, p);
194 19584 top = svmlalt(top, a, p);
195
196 // bias + a * p + b * q
197 19584 bot = svmlalb(bot, b, q);
198 19584 top = svmlalt(top, b, q);
199
200 // bias + a * p + b * q + c * r
201 19584 bot = svmlalb(bot, c, r);
202 19584 top = svmlalt(top, c, r);
203
204 // bias + a * p + b * q + c * r + d * s
205 19584 bot = svmlalb(bot, d, s);
206 19584 top = svmlalt(top, d, s);
207
208 // (bias + a * p + b * q + c * r + d * s) / 64
209 19584 svuint8_t result = svshrnt(svshrnb(bot, 6ULL), top, 6ULL);
210 39168 return result;
211 19584 };
212
213 // Handle top or bottom edge
214 400 auto process_edge_row = [src_width, dst_width, lerp1d_vector](
215 const uint8_t *src_row,
216 uint8_t *dst_row) KLEIDICV_STREAMING {
217 // Left elements
218 264 dst_row[1] = dst_row[0] = src_row[0];
219
220 // Right elements
221 264 dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
222
223 // Middle elements
224
2/2
✓ Branch 0 taken 264 times.
✓ Branch 1 taken 592 times.
856 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
225 592 size_t dst_x = src_x * 4 + 2;
226 592 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
227 592 svuint8_t a = svld1_u8(pg, src_row + src_x);
228 592 svuint8_t b = svld1_u8(pg, src_row + src_x + 1);
229 1184 svst4_u8(pg, dst_row + dst_x,
230 1184 svcreate4(lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b),
231 592 lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)));
232 592 }
233 264 };
234
235 648 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
236 const uint8_t *src_row0, const uint8_t *src_row1,
237 uint8_t *dst_row0, uint8_t *dst_row1,
238 uint8_t *dst_row2,
239 uint8_t *dst_row3) KLEIDICV_STREAMING {
240 // Left elements
241 512 svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read 1 element
242 512 svbool_t pg2 = svptrue_pat_b8(SV_VL2); // write 2 elements
243 {
244 512 svuint8_t s0 = svdup_lane(svld1(pg1, src_row0), 0);
245 512 svuint8_t s1 = svdup_lane(svld1(pg1, src_row1), 0);
246 512 svst1(pg2, dst_row0, lerp1d_vector(7, s0, 1, s1));
247 512 svst1(pg2, dst_row1, lerp1d_vector(5, s0, 3, s1));
248 512 svst1(pg2, dst_row2, lerp1d_vector(3, s0, 5, s1));
249 512 svst1(pg2, dst_row3, lerp1d_vector(1, s0, 7, s1));
250 512 }
251
252 // Middle elements
253
2/2
✓ Branch 0 taken 512 times.
✓ Branch 1 taken 1224 times.
1736 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
254 1224 size_t dst_x = src_x * 4 + 2;
255
256 1224 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
257
258 1224 svuint8_t a = svld1_u8(pg, src_row0 + src_x);
259 1224 svuint8_t b = svld1_u8(pg, src_row0 + src_x + 1);
260 1224 svuint8_t c = svld1_u8(pg, src_row1 + src_x);
261 1224 svuint8_t d = svld1_u8(pg, src_row1 + src_x + 1);
262
263 2448 svst4_u8(pg, dst_row0 + dst_x,
264 2448 (svcreate4(lerp2d_vector(49, a, 7, b, 7, c, 1, d),
265 1224 lerp2d_vector(35, a, 21, b, 5, c, 3, d),
266 1224 lerp2d_vector(21, a, 35, b, 3, c, 5, d),
267 1224 lerp2d_vector(49, b, 7, a, 7, d, 1, c))));
268
269 2448 svst4_u8(pg, dst_row1 + dst_x,
270 2448 (svcreate4(lerp2d_vector(35, a, 5, b, 21, c, 3, d),
271 1224 lerp2d_vector(25, a, 15, b, 15, c, 9, d),
272 1224 lerp2d_vector(15, a, 25, b, 9, c, 15, d),
273 1224 lerp2d_vector(5, a, 35, b, 3, c, 21, d))));
274 2448 svst4_u8(pg, dst_row2 + dst_x,
275 2448 (svcreate4(lerp2d_vector(21, a, 3, b, 35, c, 5, d),
276 1224 lerp2d_vector(15, a, 9, b, 25, c, 15, d),
277 1224 lerp2d_vector(9, a, 15, b, 15, c, 25, d),
278 1224 lerp2d_vector(3, a, 21, b, 5, c, 35, d))));
279 2448 svst4_u8(pg, dst_row3 + dst_x,
280 2448 (svcreate4(lerp2d_vector(49, c, 7, a, 7, d, 1, b),
281 1224 lerp2d_vector(5, a, 3, b, 35, c, 21, d),
282 1224 lerp2d_vector(3, a, 5, b, 21, c, 35, d),
283 1224 lerp2d_vector(49, d, 7, b, 7, c, 1, a))));
284 1224 }
285
286 // Right elements
287 512 svuint8_t s0 = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
288 512 svuint8_t s1 = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
289 512 svst1(pg2, dst_row0 + dst_width - 2, lerp1d_vector(7, s0, 1, s1));
290 512 svst1(pg2, dst_row1 + dst_width - 2, lerp1d_vector(5, s0, 3, s1));
291 512 svst1(pg2, dst_row2 + dst_width - 2, lerp1d_vector(3, s0, 5, s1));
292 512 svst1(pg2, dst_row3 + dst_width - 2, lerp1d_vector(1, s0, 7, s1));
293 512 };
294
295 400 auto copy_dst_row = [src_width](const uint8_t *dst_from,
296 uint8_t *dst_to) KLEIDICV_STREAMING {
297
2/2
✓ Branch 0 taken 264 times.
✓ Branch 1 taken 680 times.
944 for (size_t i = 0; i < src_width; i += svcntb()) {
298 680 svbool_t pg = svwhilelt_b8_u64(i, src_width);
299 680 svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4));
300 680 }
301 264 };
302
303 // Top rows
304
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 132 times.
136 if (KLEIDICV_LIKELY(y_begin == 0)) {
305 132 process_edge_row(src, dst);
306 132 copy_dst_row(dst, dst + dst_stride);
307 132 }
308
309 // Middle rows
310
2/2
✓ Branch 0 taken 512 times.
✓ Branch 1 taken 136 times.
648 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
311 512 size_t dst_y = src_y * 4 + 2;
312 512 const uint8_t *src_row0 = src + src_stride * src_y;
313 512 const uint8_t *src_row1 = src_row0 + src_stride;
314 512 uint8_t *dst_row0 = dst + dst_stride * dst_y;
315 512 uint8_t *dst_row1 = dst_row0 + dst_stride;
316 512 uint8_t *dst_row2 = dst_row1 + dst_stride;
317 512 uint8_t *dst_row3 = dst_row2 + dst_stride;
318
319 512 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
320 512 }
321
322 // Bottom rows
323
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 132 times.
136 if (KLEIDICV_LIKELY(y_end == src_height)) {
324 264 process_edge_row(src + src_stride * (src_height - 1),
325 132 dst + dst_stride * (dst_height - 2));
326 264 copy_dst_row(dst + dst_stride * (dst_height - 2),
327 132 dst + dst_stride * (dst_height - 1));
328 132 }
329
330 136 return KLEIDICV_OK;
331 136 }
332
333 200 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc(
334 const float *src, size_t src_stride, size_t src_width, size_t src_height,
335 size_t y_begin, size_t y_end, float *dst,
336 size_t dst_stride) KLEIDICV_STREAMING {
337 200 size_t dst_width = src_width * 2;
338 200 src_stride /= sizeof(float);
339 200 dst_stride /= sizeof(float);
340
341 6528 auto lerp1d_vector = [](svbool_t pg, svfloat32_t near,
342 svfloat32_t far) KLEIDICV_STREAMING {
343 6328 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.75F), far, 0.25F);
344 };
345
346 15728 auto lerp2d_vector = [](svbool_t pg, svfloat32_t near, svfloat32_t mid_a,
347 svfloat32_t mid_b,
348 svfloat32_t far) KLEIDICV_STREAMING {
349 15528 return svmla_n_f32_x(
350 15528 pg,
351 15528 svmla_n_f32_x(
352 15528 pg,
353 15528 svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.5625F), mid_a, 0.1875F),
354 15528 mid_b, 0.1875F),
355 15528 far, 0.0625F);
356 };
357
358 // Handle top or bottom edge
359 592 auto process_edge_row = [src_width, dst_width, lerp1d_vector](
360 const float *src_row,
361 float *dst_row) KLEIDICV_STREAMING {
362 // Left element
363 392 dst_row[0] = src_row[0];
364
365 // Middle elements
366
2/2
✓ Branch 0 taken 392 times.
✓ Branch 1 taken 1980 times.
2372 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
367 1980 size_t dst_x = src_x * 2 + 1;
368
369 1980 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
370
371 1980 svfloat32_t a = svld1_f32(pg, src_row + src_x);
372 1980 svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
373
374 3960 svst2_f32(pg, dst_row + dst_x,
375 1980 svcreate2(lerp1d_vector(pg, a, b), lerp1d_vector(pg, b, a)));
376 1980 }
377
378 // Right element
379 392 dst_row[dst_width - 1] = src_row[src_width - 1];
380 392 };
381
382 792 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
383 const float *src_row0, const float *src_row1,
384 float *dst_row0, float *dst_row1) KLEIDICV_STREAMING {
385 // Left elements
386 592 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read/write 1 element
387 {
388 592 svfloat32_t s0 = svld1(pg1, src_row0);
389 592 svfloat32_t s1 = svld1(pg1, src_row1);
390 592 svst1(pg1, dst_row0, lerp1d_vector(pg1, s0, s1));
391 592 svst1(pg1, dst_row1, lerp1d_vector(pg1, s1, s0));
392 592 }
393
394 // Middle elements
395
2/2
✓ Branch 0 taken 592 times.
✓ Branch 1 taken 3882 times.
4474 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
396 3882 size_t dst_x = src_x * 2 + 1;
397
398 3882 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
399
400 3882 svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
401 3882 svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
402 3882 svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
403 3882 svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);
404
405 7764 svst2_f32(pg, dst_row0 + dst_x,
406 7764 svcreate2(lerp2d_vector(pg, a, b, c, d),
407 3882 lerp2d_vector(pg, b, a, d, c)));
408 7764 svst2_f32(pg, dst_row1 + dst_x,
409 7764 svcreate2(lerp2d_vector(pg, c, a, d, b),
410 3882 lerp2d_vector(pg, d, b, c, a)));
411 3882 }
412
413 // Right elements
414 592 svfloat32_t s0 = svld1(pg1, src_row0 + src_width - 1);
415 592 svfloat32_t s1 = svld1(pg1, src_row1 + src_width - 1);
416 592 svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(pg1, s0, s1));
417 592 svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(pg1, s1, s0));
418 592 };
419
420 // Top row
421
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 196 times.
200 if (KLEIDICV_LIKELY(y_begin == 0)) {
422 196 process_edge_row(src, dst);
423 196 }
424 // Middle rows
425
2/2
✓ Branch 0 taken 592 times.
✓ Branch 1 taken 200 times.
792 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
426 592 size_t dst_y = src_y * 2 + 1;
427 592 const float *src_row0 = src + src_stride * src_y;
428 592 const float *src_row1 = src_row0 + src_stride;
429 592 float *dst_row0 = dst + dst_stride * dst_y;
430 592 float *dst_row1 = dst_row0 + dst_stride;
431
432 592 process_row(src_row0, src_row1, dst_row0, dst_row1);
433 592 }
434
435 // Bottom row
436
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 196 times.
200 if (KLEIDICV_LIKELY(y_end == src_height)) {
437 392 process_edge_row(src + src_stride * (src_height - 1),
438 196 dst + dst_stride * (src_height * 2 - 1));
439 196 }
440
441 200 return KLEIDICV_OK;
442 200 }
443
444 152 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc(
445 const float *src, size_t src_stride, size_t src_width, size_t src_height,
446 size_t y_begin, size_t y_end, float *dst,
447 size_t dst_stride) KLEIDICV_STREAMING {
448 152 size_t dst_width = src_width * 4;
449 152 size_t dst_height = src_height * 4;
450 152 src_stride /= sizeof(float);
451 152 dst_stride /= sizeof(float);
452
453 42584 auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
454 svfloat32_t b) KLEIDICV_STREAMING {
455 42432 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
456 };
457
458 30696 auto lerp2d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
459 svfloat32_t b, float r, svfloat32_t c, float s,
460 svfloat32_t d) KLEIDICV_STREAMING {
461 30544 return svmla_n_f32_x(
462 30544 pg,
463 61088 svmla_n_f32_x(pg, svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q), c,
464 30544 r),
465 30544 d, s);
466 };
467
468 // Handle top or bottom edge
469 448 auto process_edge_row = [src_width, dst_width, dst_stride, lerp1d_vector](
470 const float *src_row,
471 float *dst_row) KLEIDICV_STREAMING {
472 // Left elements
473 296 dst_row[1] = dst_row[0] = dst_row[dst_stride + 1] = dst_row[dst_stride] =
474 296 src_row[0];
475
476 // Right elements
477 296 dst_row[dst_width - 1] = dst_row[dst_width - 2] =
478 296 dst_row[dst_stride + dst_width - 1] =
479 296 dst_row[dst_stride + dst_width - 2] = src_row[src_width - 1];
480
481 // Middle elements
482
2/2
✓ Branch 0 taken 296 times.
✓ Branch 1 taken 1916 times.
2212 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
483 1916 size_t dst_x = src_x * 4 + 2;
484 1916 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
485 1916 svfloat32_t a = svld1_f32(pg, src_row + src_x);
486 1916 svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
487 3832 svfloat32x4_t result = svcreate4(lerp1d_vector(pg, 0.875F, a, 0.125F, b),
488 1916 lerp1d_vector(pg, 0.625F, a, 0.375F, b),
489 1916 lerp1d_vector(pg, 0.375F, a, 0.625F, b),
490 1916 lerp1d_vector(pg, 0.125F, a, 0.875F, b));
491 1916 svst4_f32(pg, dst_row + dst_x, result);
492 1916 svst4_f32(pg, dst_row + dst_stride + dst_x, result);
493 1916 }
494 296 };
495
496 680 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
497 const float *src_row0, const float *src_row1,
498 float *dst_row0, float *dst_row1, float *dst_row2,
499 float *dst_row3) KLEIDICV_STREAMING {
500 // Left elements
501 528 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
502 528 svbool_t pg2 = svptrue_pat_b32(SV_VL2); // write 2 elements
503 528 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
504 528 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
505 528 svst1(pg2, dst_row0, lerp1d_vector(pg2, 0.875F, s0l, 0.125F, s1l));
506 528 svst1(pg2, dst_row1, lerp1d_vector(pg2, 0.625F, s0l, 0.375F, s1l));
507 528 svst1(pg2, dst_row2, lerp1d_vector(pg2, 0.375F, s0l, 0.625F, s1l));
508 528 svst1(pg2, dst_row3, lerp1d_vector(pg2, 0.125F, s0l, 0.875F, s1l));
509
510 // Middle elements
511
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 3818 times.
4346 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
512 3818 size_t dst_x = src_x * 4 + 2;
513
514 3818 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
515
516 3818 svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
517 3818 svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
518 3818 svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
519 3818 svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);
520
521 7636 svfloat32x4_t dst_a =
522 11454 svcreate4(lerp2d_vector(pg, 0.765625F, a, 0.109375F, b, 0.109375F, c,
523 3818 0.015625F, d),
524 7636 lerp2d_vector(pg, 0.546875F, a, 0.328125F, b, 0.078125F, c,
525 3818 0.046875F, d),
526 7636 lerp2d_vector(pg, 0.328125F, a, 0.546875F, b, 0.046875F, c,
527 3818 0.078125F, d),
528 7636 lerp2d_vector(pg, 0.109375F, a, 0.765625F, b, 0.015625F, c,
529 3818 0.109375F, d));
530 7636 svfloat32x4_t dst_d =
531 11454 svcreate4(lerp2d_vector(pg, 0.109375F, a, 0.015625F, b, 0.765625F, c,
532 3818 0.109375F, d),
533 7636 lerp2d_vector(pg, 0.078125F, a, 0.046875F, b, 0.546875F, c,
534 3818 0.328125F, d),
535 7636 lerp2d_vector(pg, 0.046875F, a, 0.078125F, b, 0.328125F, c,
536 3818 0.546875F, d),
537 7636 lerp2d_vector(pg, 0.015625F, a, 0.109375F, b, 0.109375F, c,
538 3818 0.765625F, d));
539 3818 const float one_3rd = 0.3333333333333333F;
540 3818 const float two_3rd = 0.6666666666666667F;
541 3818 svst4_f32(pg, dst_row0 + dst_x, dst_a);
542 7636 svst4_f32(pg, dst_row1 + dst_x,
543 11454 svcreate4(lerp1d_vector(pg, two_3rd, svget4(dst_a, 0), one_3rd,
544 3818 svget4(dst_d, 0)),
545 7636 lerp1d_vector(pg, two_3rd, svget4(dst_a, 1), one_3rd,
546 3818 svget4(dst_d, 1)),
547 7636 lerp1d_vector(pg, two_3rd, svget4(dst_a, 2), one_3rd,
548 3818 svget4(dst_d, 2)),
549 7636 lerp1d_vector(pg, two_3rd, svget4(dst_a, 3), one_3rd,
550 3818 svget4(dst_d, 3))));
551 7636 svst4_f32(pg, dst_row2 + dst_x,
552 11454 svcreate4(lerp1d_vector(pg, one_3rd, svget4(dst_a, 0), two_3rd,
553 3818 svget4(dst_d, 0)),
554 7636 lerp1d_vector(pg, one_3rd, svget4(dst_a, 1), two_3rd,
555 3818 svget4(dst_d, 1)),
556 7636 lerp1d_vector(pg, one_3rd, svget4(dst_a, 2), two_3rd,
557 3818 svget4(dst_d, 2)),
558 7636 lerp1d_vector(pg, one_3rd, svget4(dst_a, 3), two_3rd,
559 3818 svget4(dst_d, 3))));
560 3818 svst4_f32(pg, dst_row3 + dst_x, dst_d);
561 3818 }
562
563 // Right elements
564 528 svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
565 528 svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
566 1056 svst1(pg2, dst_row0 + dst_width - 2,
567 528 lerp1d_vector(pg2, 0.875F, s0r, 0.125F, s1r));
568 1056 svst1(pg2, dst_row1 + dst_width - 2,
569 528 lerp1d_vector(pg2, 0.625F, s0r, 0.375F, s1r));
570 1056 svst1(pg2, dst_row2 + dst_width - 2,
571 528 lerp1d_vector(pg2, 0.375F, s0r, 0.625F, s1r));
572 1056 svst1(pg2, dst_row3 + dst_width - 2,
573 528 lerp1d_vector(pg2, 0.125F, s0r, 0.875F, s1r));
574 528 };
575
576 // Top rows
577
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 148 times.
152 if (KLEIDICV_LIKELY(y_begin == 0)) {
578 148 process_edge_row(src, dst);
579 148 }
580
581 // Middle rows
582
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 152 times.
680 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
583 528 size_t dst_y = src_y * 4 + 2;
584 528 const float *src_row0 = src + src_stride * src_y;
585 528 const float *src_row1 = src_row0 + src_stride;
586 528 float *dst_row0 = dst + dst_stride * dst_y;
587 528 float *dst_row1 = dst_row0 + dst_stride;
588 528 float *dst_row2 = dst_row1 + dst_stride;
589 528 float *dst_row3 = dst_row2 + dst_stride;
590
591 528 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
592 528 }
593
594 // Bottom rows
595
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 148 times.
152 if (KLEIDICV_LIKELY(y_end == src_height)) {
596 296 process_edge_row(src + src_stride * (src_height - 1),
597 148 dst + dst_stride * (dst_height - 2));
598 148 }
599 152 return KLEIDICV_OK;
600 152 }
601
602 34 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc(
603 const float *src, size_t src_stride, size_t src_width, size_t src_height,
604 size_t y_begin, size_t y_end, float *dst,
605 size_t dst_stride) KLEIDICV_STREAMING {
606 34 size_t dst_width = src_width * 8;
607 34 size_t dst_height = src_height * 8;
608 34 src_stride /= sizeof(float);
609 34 dst_stride /= sizeof(float);
610
611 34 float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0,
612 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0};
613 34 float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0,
614 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0};
615 34 svfloat32_t coeffs_a0 = svld1(svptrue_b32(), &coeffs_a[0]);
616 34 svfloat32_t coeffs_a1 = svld1(svptrue_b32(), &coeffs_a[4]);
617 34 svfloat32_t coeffs_b0 = svld1(svptrue_b32(), &coeffs_b[0]);
618 34 svfloat32_t coeffs_b1 = svld1(svptrue_b32(), &coeffs_b[4]);
619 68 std::reference_wrapper<svfloat32_t> coeffs_ab[4] = {coeffs_a0, coeffs_a1,
620 68 coeffs_b0, coeffs_b1};
621
622 111618 auto lerp1d_vector_n = [](svbool_t pg, float p, svfloat32_t a, float q,
623 svfloat32_t b) KLEIDICV_STREAMING {
624 111584 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
625 };
626
627 8970 auto lerp1d_vector = [](svbool_t pg, svfloat32_t p, svfloat32_t a,
628 svfloat32_t q, svfloat32_t b) KLEIDICV_STREAMING {
629 8936 return svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q);
630 };
631
632 // Handle top or bottom edge
633 34 auto process_edge_row =
634 100 [src_width, dst_width, lerp1d_vector](
635 const float *src_row, float *dst_row, size_t dst_stride,
636 std::reference_wrapper<svfloat32_t> coeffs_ab[4]) KLEIDICV_STREAMING {
637 // Left elements
638 66 float left = src_row[0];
639 66 float *dst = dst_row;
640
2/2
✓ Branch 0 taken 264 times.
✓ Branch 1 taken 66 times.
330 for (size_t i = 0; i < 4; ++i) {
641 264 *dst++ = left;
642 264 *dst++ = left;
643 264 *dst++ = left;
644 264 *dst = left;
645 264 dst += dst_stride - 3;
646 264 }
647
648 // Middle elements
649 66 svfloat32_t a, b = svdup_n_f32(src_row[0]);
650
2/2
✓ Branch 0 taken 4468 times.
✓ Branch 1 taken 66 times.
4534 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
651 4468 a = b;
652 4468 b = svdup_n_f32(src_row[src_x + 1]);
653 4468 float *dst_row0 = dst_row + src_x * 8 + 4;
654 4468 float *dst_row1 = dst_row0 + dst_stride;
655 4468 float *dst_row2 = dst_row1 + dst_stride;
656 4468 float *dst_row3 = dst_row2 + dst_stride;
657 8936 svfloat32_t dst =
658 4468 lerp1d_vector(svptrue_b32(), coeffs_ab[0], a, coeffs_ab[2], b);
659 4468 svst1(svptrue_b32(), dst_row0, dst);
660 4468 svst1(svptrue_b32(), dst_row1, dst);
661 4468 svst1(svptrue_b32(), dst_row2, dst);
662 4468 svst1(svptrue_b32(), dst_row3, dst);
663 4468 dst = lerp1d_vector(svptrue_b32(), coeffs_ab[1], a, coeffs_ab[3], b);
664 4468 svst1(svptrue_b32(), dst_row0 + 4, dst);
665 4468 svst1(svptrue_b32(), dst_row1 + 4, dst);
666 4468 svst1(svptrue_b32(), dst_row2 + 4, dst);
667 4468 svst1(svptrue_b32(), dst_row3 + 4, dst);
668 4468 }
669
670 // Right elements
671 66 dst = dst_row + dst_width - 4;
672 66 float right = src_row[src_width - 1];
673
2/2
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 264 times.
330 for (size_t i = 0; i < 4; ++i) {
674 264 *dst++ = right;
675 264 *dst++ = right;
676 264 *dst++ = right;
677 264 *dst = right;
678 264 dst += dst_stride - 3;
679 264 }
680 66 };
681
682 34 svfloat32_t coeffs_p0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 15.0 / 16);
683 34 svfloat32_t coeffs_q0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 15.0 / 16);
684 34 svfloat32_t coeffs_r0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 1.0 / 16);
685 34 svfloat32_t coeffs_s0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 1.0 / 16);
686 34 svfloat32_t coeffs_p1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 15.0 / 16);
687 34 svfloat32_t coeffs_q1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 15.0 / 16);
688 34 svfloat32_t coeffs_r1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 1.0 / 16);
689 34 svfloat32_t coeffs_s1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 1.0 / 16);
690
691 272 std::reference_wrapper<svfloat32_t> coeffs_pqrs[8] = {
692 136 coeffs_p0, coeffs_p1, coeffs_q0, coeffs_q1,
693 136 coeffs_r0, coeffs_r1, coeffs_s0, coeffs_s1,
694 };
695
696 36546 auto lerp2d_vector = [](svbool_t pg, svfloat32_t a, svfloat32_t p,
697 svfloat32_t b, svfloat32_t q, svfloat32_t c,
698 svfloat32_t r, svfloat32_t d,
699 svfloat32_t s) KLEIDICV_STREAMING {
700 36512 return svmla_f32_x(
701 36512 pg, svmla_f32_x(pg, svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q), c, r),
702 36512 d, s);
703 };
704
705 162 auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n](
706 const float *src_row0, const float *src_row1,
707 float *dst_row0, size_t dst_stride,
708 std::reference_wrapper<svfloat32_t>
709 coeffs_pqrs[8]) KLEIDICV_STREAMING {
710 // Left elements
711 128 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
712 128 svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements
713 128 float *dst_lr = dst_row0;
714 128 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
715 128 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
716
2/2
✓ Branch 0 taken 1024 times.
✓ Branch 1 taken 128 times.
1152 for (size_t i = 0; i < 8; ++i) {
717 2048 svst1(pg4, dst_lr,
718 2048 lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
719 1024 static_cast<float>(i * 2 + 1) / 16.0F, s1l));
720 1024 dst_lr += dst_stride;
721 1024 }
722
723 // Middle elements
724 128 dst_row0 += 4;
725 128 float *dst_row1 = dst_row0 + dst_stride;
726 128 float *dst_row2 = dst_row1 + dst_stride;
727 128 float *dst_row3 = dst_row2 + dst_stride;
728 128 float *dst_row4 = dst_row3 + dst_stride;
729 128 float *dst_row5 = dst_row4 + dst_stride;
730 128 float *dst_row6 = dst_row5 + dst_stride;
731 128 float *dst_row7 = dst_row6 + dst_stride;
732 128 svfloat32_t a, b = s0l;
733 128 svfloat32_t c, d = s1l;
734
2/2
✓ Branch 0 taken 9128 times.
✓ Branch 1 taken 128 times.
9256 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
735 9128 a = b;
736 9128 b = svdup_lane(svld1(pg1, src_row0 + src_x + 1), 0);
737 9128 c = d;
738 9128 d = svdup_lane(svld1(pg1, src_row1 + src_x + 1), 0);
739 18256 svfloat32_t dst_0 =
740 18256 lerp2d_vector(svptrue_b32(), coeffs_pqrs[0], a, coeffs_pqrs[2], b,
741 9128 coeffs_pqrs[4], c, coeffs_pqrs[6], d);
742 9128 svst1(svptrue_b32(), dst_row0, dst_0);
743 18256 svfloat32_t dst_7 =
744 18256 lerp2d_vector(svptrue_b32(), coeffs_pqrs[4], a, coeffs_pqrs[6], b,
745 9128 coeffs_pqrs[0], c, coeffs_pqrs[2], d);
746 9128 svst1(svptrue_b32(), dst_row7, dst_7);
747 18256 svst1(svptrue_b32(), dst_row1,
748 9128 lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
749 18256 svst1(svptrue_b32(), dst_row2,
750 9128 lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
751 18256 svst1(svptrue_b32(), dst_row3,
752 9128 lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
753 18256 svst1(svptrue_b32(), dst_row4,
754 9128 lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
755 18256 svst1(svptrue_b32(), dst_row5,
756 9128 lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
757 18256 svst1(svptrue_b32(), dst_row6,
758 9128 lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
759 9128 dst_row0 += 4;
760 9128 dst_row1 += 4;
761 9128 dst_row2 += 4;
762 9128 dst_row3 += 4;
763 9128 dst_row4 += 4;
764 9128 dst_row5 += 4;
765 9128 dst_row6 += 4;
766 9128 dst_row7 += 4;
767 18256 dst_0 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[1], a, coeffs_pqrs[3], b,
768 9128 coeffs_pqrs[5], c, coeffs_pqrs[7], d);
769 9128 svst1(svptrue_b32(), dst_row0, dst_0);
770 18256 dst_7 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[5], a, coeffs_pqrs[7], b,
771 9128 coeffs_pqrs[1], c, coeffs_pqrs[3], d);
772 9128 svst1(svptrue_b32(), dst_row7, dst_7);
773 18256 svst1(svptrue_b32(), dst_row1,
774 9128 lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
775 18256 svst1(svptrue_b32(), dst_row2,
776 9128 lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
777 18256 svst1(svptrue_b32(), dst_row3,
778 9128 lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
779 18256 svst1(svptrue_b32(), dst_row4,
780 9128 lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
781 18256 svst1(svptrue_b32(), dst_row5,
782 9128 lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
783 18256 svst1(svptrue_b32(), dst_row6,
784 9128 lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
785 9128 dst_row0 += 4;
786 9128 dst_row1 += 4;
787 9128 dst_row2 += 4;
788 9128 dst_row3 += 4;
789 9128 dst_row4 += 4;
790 9128 dst_row5 += 4;
791 9128 dst_row6 += 4;
792 9128 dst_row7 += 4;
793 9128 }
794
795 // Right elements
796 128 dst_lr = dst_row0;
797 128 svfloat32_t s0r = b;
798 128 svfloat32_t s1r = d;
799
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 1024 times.
1152 for (size_t i = 0; i < 8; ++i) {
800 2048 svst1(pg4, dst_lr,
801 2048 lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
802 1024 static_cast<float>(i * 2 + 1) / 16.0F, s1r));
803 1024 dst_lr += dst_stride;
804 1024 }
805 128 };
806
807 // Top rows
808
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_begin == 0)) {
809 33 process_edge_row(src, dst, dst_stride, coeffs_ab);
810 33 }
811
812 // Middle rows
813
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
162 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
814 128 size_t dst_y = src_y * 8 + 4;
815 128 const float *src_row0 = src + src_stride * src_y;
816 128 const float *src_row1 = src_row0 + src_stride;
817 256 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
818 128 coeffs_pqrs);
819 128 }
820
821 // Bottom rows
822
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_end == src_height)) {
823 66 process_edge_row(src + src_stride * (src_height - 1),
824 33 dst + dst_stride * (dst_height - 4), dst_stride,
825 33 coeffs_ab);
826 33 }
827
828 34 return KLEIDICV_OK;
829 34 }
830
831 102 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc(
832 const float *src, size_t src_stride, size_t src_width, size_t src_height,
833 size_t y_begin, size_t y_end, float *dst,
834 size_t dst_stride) KLEIDICV_STREAMING {
835 102 size_t dst_width = src_width * 8;
836 102 size_t dst_height = src_height * 8;
837 102 src_stride /= sizeof(float);
838 102 dst_stride /= sizeof(float);
839
840 102 svuint32_t indices_0a, indices_0b, indices_1a, indices_1b, indices_2a,
841 indices_2b, indices_3a, indices_3b;
842 {
843 // indices of the left-hand source element of each horizontal pair
844 102 svuint32_t tmp_2x = svreinterpret_u32_u64(svindex_u64(0, 0x100000001UL));
845 102 svuint32_t tmp_4x = svzip1(tmp_2x, tmp_2x); // 0, 0, 0, 0, 1, 1, 1, 1, ...
846 102 indices_0a = svzip1(tmp_4x, tmp_4x); // 8 times 0, then 8 times 1, ...
847 102 indices_1a = svzip2(tmp_4x, tmp_4x);
848 // next section, e.g. in case of 512-bit regs (=16 x F32), it is 4, 4, 4, 4,
849 // 5, 5, 5, 5, ...
850 102 tmp_4x = svzip2(tmp_2x, tmp_2x);
851 102 indices_2a = svzip1(tmp_4x, tmp_4x);
852 102 indices_3a = svzip2(tmp_4x, tmp_4x);
853
854 // same as above, but every index is one higher (right-hand neighbour)
855 102 tmp_2x = svreinterpret_u32_u64(svindex_u64(0x100000001UL, 0x100000001UL));
856 102 tmp_4x = svzip1(tmp_2x, tmp_2x); // 1, 1, 1, 1, ...
857 102 indices_0b = svzip1(tmp_4x, tmp_4x); // 8 times 1, then 8 times 2, ...
858 102 indices_1b = svzip2(tmp_4x, tmp_4x);
859 // next section, e.g. in case of 512-bit regs (=16 x F32), it is 5, 5, 5, 5,
860 // 6, 6, 6, 6, ...
861 102 tmp_4x = svzip2(tmp_2x, tmp_2x);
862 102 indices_2b = svzip1(tmp_4x, tmp_4x);
863 102 indices_3b = svzip2(tmp_4x, tmp_4x);
864 102 }
865 816 std::reference_wrapper<svuint32_t> indices[8] = {
866 408 indices_0a, indices_0b, indices_1a, indices_1b,
867 408 indices_2a, indices_2b, indices_3a, indices_3b};
868
869 102 svfloat32_t coeffs_a, coeffs_b;
870 {
871 // Prepare 1/16, 3/16, 5/16, ..., 15/16, repeated
872 102 svuint32_t linear = svindex_u32(1, 2);
873 204 svfloat32_t repetitive_float = // mod 16
874 102 svcvt_f32_x(svptrue_b32(), svand_n_u32_m(svptrue_b32(), linear, 0x0F));
875 102 coeffs_b = svdiv_n_f32_x(svptrue_b32(), repetitive_float, 16.0F);
876 102 coeffs_a = svsub_x(svptrue_b32(), svdup_f32(1.0F), coeffs_b);
877 102 }
878 102 std::reference_wrapper<svfloat32_t> coeffs_ab[2] = {coeffs_a, coeffs_b};
879
880 73062 auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
881 svfloat32_t b) KLEIDICV_STREAMING {
882 72960 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
883 };
884
885 5558 auto index_and_lerp1d = [](svbool_t pg, svuint32_t indices_a,
886 svuint32_t indices_b,
887 std::reference_wrapper<svfloat32_t> coeffs_ab[2],
888 svfloat32_t src) KLEIDICV_STREAMING {
889 10912 return svmla_f32_x(pg, svmul_f32_x(pg, svtbl(src, indices_a), coeffs_ab[0]),
890 5456 svtbl(src, indices_b), coeffs_ab[1]);
891 };
892
893 // Handle top or bottom edge
894 102 auto process_edge_row =
895 300 [src_width, dst_width, index_and_lerp1d](
896 const float *src_row, float *dst_row, size_t dst_stride,
897 std::reference_wrapper<svuint32_t> indices[8],
898 std::reference_wrapper<svfloat32_t> coeffs_ab[2]) KLEIDICV_STREAMING {
899 // Left elements
900 198 float left = src_row[0];
901 198 float *dst = dst_row;
902
2/2
✓ Branch 0 taken 792 times.
✓ Branch 1 taken 198 times.
990 for (size_t i = 0; i < 4; ++i) {
903 792 *dst++ = left;
904 792 *dst++ = left;
905 792 *dst++ = left;
906 792 *dst = left;
907 792 dst += dst_stride - 3;
908 792 }
909
910 // Middle elements
911
2/2
✓ Branch 0 taken 1364 times.
✓ Branch 1 taken 198 times.
1562 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
912 1364 svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
913 1364 svfloat32_t svsrc = svld1_f32(pg, src_row + src_x);
914
915 1364 size_t dst_length = 8 * (src_width - src_x - 1);
916 1364 svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
917 1364 svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
918 1364 svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
919 1364 svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);
920
921 1364 float *dst_row0 = dst_row + src_x * 8 + 4;
922 1364 float *dst_row1 = dst_row0 + dst_stride;
923 1364 float *dst_row2 = dst_row1 + dst_stride;
924 1364 float *dst_row3 = dst_row2 + dst_stride;
925 2728 svfloat32_t dst =
926 1364 index_and_lerp1d(pg_1, indices[0], indices[1], coeffs_ab, svsrc);
927 1364 svst1(pg_1, dst_row0, dst);
928 1364 svst1(pg_1, dst_row1, dst);
929 1364 svst1(pg_1, dst_row2, dst);
930 1364 svst1(pg_1, dst_row3, dst);
931
932 1364 dst =
933 1364 index_and_lerp1d(pg_2, indices[2], indices[3], coeffs_ab, svsrc);
934 1364 svst1_vnum(pg_2, dst_row0, 1, dst);
935 1364 svst1_vnum(pg_2, dst_row1, 1, dst);
936 1364 svst1_vnum(pg_2, dst_row2, 1, dst);
937 1364 svst1_vnum(pg_2, dst_row3, 1, dst);
938
939 1364 dst =
940 1364 index_and_lerp1d(pg_3, indices[4], indices[5], coeffs_ab, svsrc);
941 1364 svst1_vnum(pg_3, dst_row0, 2, dst);
942 1364 svst1_vnum(pg_3, dst_row1, 2, dst);
943 1364 svst1_vnum(pg_3, dst_row2, 2, dst);
944 1364 svst1_vnum(pg_3, dst_row3, 2, dst);
945
946 1364 dst =
947 1364 index_and_lerp1d(pg_4, indices[6], indices[7], coeffs_ab, svsrc);
948 1364 svst1_vnum(pg_4, dst_row0, 3, dst);
949 1364 svst1_vnum(pg_4, dst_row1, 3, dst);
950 1364 svst1_vnum(pg_4, dst_row2, 3, dst);
951 1364 svst1_vnum(pg_4, dst_row3, 3, dst);
952 1364 }
953
954 // Right elements
955 198 dst = dst_row + dst_width - 4;
956 198 float right = src_row[src_width - 1];
957
2/2
✓ Branch 0 taken 198 times.
✓ Branch 1 taken 792 times.
990 for (size_t i = 0; i < 4; ++i) {
958 792 *dst++ = right;
959 792 *dst++ = right;
960 792 *dst++ = right;
961 792 *dst = right;
962 792 dst += dst_stride - 3;
963 792 }
964 198 };
965
966 102 svfloat32_t coeffs_p = svmul_n_f32_x(svptrue_b32(), coeffs_a, 15.0 / 16);
967 102 svfloat32_t coeffs_q = svmul_n_f32_x(svptrue_b32(), coeffs_b, 15.0 / 16);
968 102 svfloat32_t coeffs_r = svmul_n_f32_x(svptrue_b32(), coeffs_a, 1.0 / 16);
969 102 svfloat32_t coeffs_s = svmul_n_f32_x(svptrue_b32(), coeffs_b, 1.0 / 16);
970 204 std::reference_wrapper<svfloat32_t> coeffs_pqrs[4] = {coeffs_p, coeffs_q,
971 204 coeffs_r, coeffs_s};
972
973 22374 auto index_and_lerp2d = [](svbool_t pg, svuint32_t indices_a,
974 svuint32_t indices_b,
975 std::reference_wrapper<svfloat32_t> coeffs_pqrs[4],
976 svfloat32_t src0,
977 svfloat32_t src1) KLEIDICV_STREAMING {
978 22272 return svmla_f32_x(
979 22272 pg,
980 22272 svmla_f32_x(
981 22272 pg,
982 44544 svmla_f32_x(pg,
983 22272 svmul_f32_x(pg, svtbl(src0, indices_a), coeffs_pqrs[0]),
984 22272 svtbl(src0, indices_b), coeffs_pqrs[1]),
985 22272 svtbl(src1, indices_a), coeffs_pqrs[2]),
986 22272 svtbl(src1, indices_b), coeffs_pqrs[3]);
987 };
988
989 486 auto process_row = [src_width, dst_width, index_and_lerp2d, lerp1d_vector](
990 const float *src_row0, const float *src_row1,
991 float *dst_row, size_t dst_stride,
992 std::reference_wrapper<svuint32_t> indices[8],
993 std::reference_wrapper<svfloat32_t>
994 coeffs_pqrs[4]) KLEIDICV_STREAMING {
995 // Left edge
996 384 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
997 384 svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements
998 384 float *dst_lr = dst_row;
999 384 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
1000 384 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
1001
2/2
✓ Branch 0 taken 3072 times.
✓ Branch 1 taken 384 times.
3456 for (size_t i = 0; i < 8; ++i) {
1002 6144 svst1(pg4, dst_lr,
1003 6144 lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
1004 3072 static_cast<float>(i * 2 + 1) / 16.0F, s1l));
1005 3072 dst_lr += dst_stride;
1006 3072 }
1007
1008 // Middle elements
1009
2/2
✓ Branch 0 taken 2784 times.
✓ Branch 1 taken 384 times.
3168 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
1010 2784 size_t dst_x = src_x * 8 + 4;
1011
1012 2784 svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
1013 2784 svfloat32_t src_0 = svld1_f32(pg, src_row0 + src_x);
1014 2784 svfloat32_t src_1 = svld1_f32(pg, src_row1 + src_x);
1015
1016 2784 size_t dst_length = 8 * (src_width - src_x - 1);
1017 2784 svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
1018 2784 svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
1019 2784 svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
1020 2784 svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);
1021
1022 2784 float *dst_row0 = dst_row + dst_x;
1023 2784 float *dst_row1 = dst_row0 + dst_stride;
1024 2784 float *dst_row2 = dst_row1 + dst_stride;
1025 2784 float *dst_row3 = dst_row2 + dst_stride;
1026 2784 float *dst_row4 = dst_row3 + dst_stride;
1027 2784 float *dst_row5 = dst_row4 + dst_stride;
1028 2784 float *dst_row6 = dst_row5 + dst_stride;
1029 2784 float *dst_row7 = dst_row6 + dst_stride;
1030
1031 5568 svfloat32_t dst_0 = index_and_lerp2d(pg_1, indices[0], indices[1],
1032 2784 coeffs_pqrs, src_0, src_1);
1033 2784 svst1(pg_1, dst_row0, dst_0);
1034 5568 svfloat32_t dst_7 = index_and_lerp2d(pg_1, indices[0], indices[1],
1035 2784 coeffs_pqrs, src_1, src_0);
1036 2784 svst1(pg_1, dst_row7, dst_7);
1037 5568 svst1(pg_1, dst_row1,
1038 2784 lerp1d_vector(pg_1, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1039 5568 svst1(pg_1, dst_row2,
1040 2784 lerp1d_vector(pg_1, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1041 5568 svst1(pg_1, dst_row3,
1042 2784 lerp1d_vector(pg_1, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1043 5568 svst1(pg_1, dst_row4,
1044 2784 lerp1d_vector(pg_1, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1045 5568 svst1(pg_1, dst_row5,
1046 2784 lerp1d_vector(pg_1, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1047 5568 svst1(pg_1, dst_row6,
1048 2784 lerp1d_vector(pg_1, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1049
1050 5568 dst_0 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_0,
1051 2784 src_1);
1052 2784 svst1_vnum(pg_2, dst_row0, 1, dst_0);
1053 5568 dst_7 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_1,
1054 2784 src_0);
1055 2784 svst1_vnum(pg_2, dst_row7, 1, dst_7);
1056 5568 svst1_vnum(pg_2, dst_row1, 1,
1057 2784 lerp1d_vector(pg_2, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1058 5568 svst1_vnum(pg_2, dst_row2, 1,
1059 2784 lerp1d_vector(pg_2, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1060 5568 svst1_vnum(pg_2, dst_row3, 1,
1061 2784 lerp1d_vector(pg_2, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1062 5568 svst1_vnum(pg_2, dst_row4, 1,
1063 2784 lerp1d_vector(pg_2, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1064 5568 svst1_vnum(pg_2, dst_row5, 1,
1065 2784 lerp1d_vector(pg_2, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1066 5568 svst1_vnum(pg_2, dst_row6, 1,
1067 2784 lerp1d_vector(pg_2, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1068
1069 5568 dst_0 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_0,
1070 2784 src_1);
1071 2784 svst1_vnum(pg_3, dst_row0, 2, dst_0);
1072 5568 dst_7 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_1,
1073 2784 src_0);
1074 2784 svst1_vnum(pg_3, dst_row7, 2, dst_7);
1075 5568 svst1_vnum(pg_3, dst_row1, 2,
1076 2784 lerp1d_vector(pg_3, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1077 5568 svst1_vnum(pg_3, dst_row2, 2,
1078 2784 lerp1d_vector(pg_3, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1079 5568 svst1_vnum(pg_3, dst_row3, 2,
1080 2784 lerp1d_vector(pg_3, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1081 5568 svst1_vnum(pg_3, dst_row4, 2,
1082 2784 lerp1d_vector(pg_3, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1083 5568 svst1_vnum(pg_3, dst_row5, 2,
1084 2784 lerp1d_vector(pg_3, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1085 5568 svst1_vnum(pg_3, dst_row6, 2,
1086 2784 lerp1d_vector(pg_3, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1087
1088 5568 dst_0 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_0,
1089 2784 src_1);
1090 2784 svst1_vnum(pg_4, dst_row0, 3, dst_0);
1091 5568 dst_7 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_1,
1092 2784 src_0);
1093 2784 svst1_vnum(pg_4, dst_row7, 3, dst_7);
1094 5568 svst1_vnum(pg_4, dst_row1, 3,
1095 2784 lerp1d_vector(pg_4, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1096 5568 svst1_vnum(pg_4, dst_row2, 3,
1097 2784 lerp1d_vector(pg_4, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1098 5568 svst1_vnum(pg_4, dst_row3, 3,
1099 2784 lerp1d_vector(pg_4, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1100 5568 svst1_vnum(pg_4, dst_row4, 3,
1101 2784 lerp1d_vector(pg_4, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1102 5568 svst1_vnum(pg_4, dst_row5, 3,
1103 2784 lerp1d_vector(pg_4, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1104 5568 svst1_vnum(pg_4, dst_row6, 3,
1105 2784 lerp1d_vector(pg_4, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1106 2784 }
1107
1108 // Right edge
1109 384 dst_lr = dst_row;
1110 384 svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
1111 384 svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
1112
2/2
✓ Branch 0 taken 384 times.
✓ Branch 1 taken 3072 times.
3456 for (size_t i = 0; i < 8; ++i) {
1113 6144 svst1(pg4, dst_lr + dst_width - 4,
1114 6144 lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
1115 3072 static_cast<float>(i * 2 + 1) / 16.0F, s1r));
1116 3072 dst_lr += dst_stride;
1117 3072 }
1118 384 };
1119
1120 // Top rows
1121
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.
102 if (KLEIDICV_LIKELY(y_begin == 0)) {
1122 99 process_edge_row(src, dst, dst_stride, indices, coeffs_ab);
1123 99 }
1124
1125 // Middle rows
1126
2/2
✓ Branch 0 taken 384 times.
✓ Branch 1 taken 102 times.
486 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
1127 384 size_t dst_y = src_y * 8 + 4;
1128 384 const float *src_row0 = src + src_stride * src_y;
1129 384 const float *src_row1 = src_row0 + src_stride;
1130 768 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
1131 384 indices, coeffs_pqrs);
1132 384 }
1133
1134 // Bottom rows
1135
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.
102 if (KLEIDICV_LIKELY(y_end == src_height)) {
1136 198 process_edge_row(src + src_stride * (src_height - 1),
1137 99 dst + dst_stride * (dst_height - 4), dst_stride, indices,
1138 99 coeffs_ab);
1139 99 }
1140
1141 102 return KLEIDICV_OK;
1142 102 }
1143
1144 380 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_u8_sc(
1145 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
1146 size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride,
1147 size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
1148
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 376 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 376 times.
380 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1149
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 372 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 372 times.
376 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1150
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 368 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 364 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 364 times.
372 CHECK_IMAGE_SIZE(dst_width, dst_height);
1151
1152
4/4
✓ Branch 0 taken 356 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 352 times.
364 if (src_width == 0 || src_height == 0) {
1153 12 return KLEIDICV_OK;
1154 }
1155
3/4
✓ Branch 0 taken 216 times.
✓ Branch 1 taken 136 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 216 times.
352 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
1156 432 return resize_2x2_u8_sc(src, src_stride, src_width, src_height, y_begin,
1157 216 y_end, dst, dst_stride);
1158 }
1159
2/4
✓ Branch 0 taken 136 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 136 times.
136 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
1160 272 return resize_4x4_u8_sc(src, src_stride, src_width, src_height, y_begin,
1161 136 y_end, dst, dst_stride);
1162 }
1163 // The u8 *_is_implemented check should have rejected other ratios already.
1164 // GCOVR_EXCL_START
1165 assert(!"resize ratio not implemented");
1166 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1167 // GCOVR_EXCL_STOP
1168 380 }
1169
1170 516 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_f32_sc(
1171 const float *src, size_t src_stride, size_t src_width, size_t src_height,
1172 size_t y_begin, size_t y_end, float *dst, size_t dst_stride,
1173 size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
1174
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 512 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 512 times.
516 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1175
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 508 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 508 times.
512 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1176
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 504 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 500 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 500 times.
508 CHECK_IMAGE_SIZE(dst_width, dst_height);
1177
1178
4/4
✓ Branch 0 taken 492 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 488 times.
500 if (src_width == 0 || src_height == 0) {
1179 12 return KLEIDICV_OK;
1180 }
1181
3/4
✓ Branch 0 taken 200 times.
✓ Branch 1 taken 288 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 200 times.
488 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
1182 400 return resize_2x2_f32_sc(src, src_stride, src_width, src_height, y_begin,
1183 200 y_end, dst, dst_stride);
1184 }
1185
3/4
✓ Branch 0 taken 152 times.
✓ Branch 1 taken 136 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 152 times.
288 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
1186 304 return resize_4x4_f32_sc(src, src_stride, src_width, src_height, y_begin,
1187 152 y_end, dst, dst_stride);
1188 }
1189
2/4
✓ Branch 0 taken 136 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 136 times.
136 if (src_width * 8 == dst_width && src_height * 8 == dst_height) {
1190
2/2
✓ Branch 0 taken 102 times.
✓ Branch 1 taken 34 times.
136 if (svcntw() >= 8) {
1191 204 return resize_8x8_f32_sve256plus_sc(src, src_stride, src_width,
1192 102 src_height, y_begin, y_end, dst,
1193 102 dst_stride);
1194 }
1195 68 return resize_8x8_f32_sve128_sc(src, src_stride, src_width, src_height,
1196 34 y_begin, y_end, dst, dst_stride);
1197 }
1198 // resize_linear_f32_is_implemented checked the kernel size already.
1199 // GCOVR_EXCL_START
1200 assert(!"resize ratio not implemented");
1201 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1202 // GCOVR_EXCL_STOP
1203 516 }
1204
1205 } // namespace KLEIDICV_TARGET_NAMESPACE
1206
1207 #endif // KLEIDICV_RESIZE_LINEAR_SC_H
1208