KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_sc.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 842 842 100.0%
Functions: 64 70 91.4%
Branches: 129 136 94.9%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RESIZE_LINEAR_SC_H
6 #define KLEIDICV_RESIZE_LINEAR_SC_H
7
8 #include <cassert>
9
10 #include "kleidicv/kleidicv.h"
11 #include "kleidicv/sve2.h"
12
13 namespace KLEIDICV_TARGET_NAMESPACE {
14
15 162 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc(
16 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
17 size_t y_begin, size_t y_end, uint8_t *dst,
18 size_t dst_stride) KLEIDICV_STREAMING {
19 162 size_t dst_width = src_width * 2;
20 162 size_t dst_height = src_height * 2;
21
22 3098 auto lerp1d_vector = [](svuint8_t near, svuint8_t far) KLEIDICV_STREAMING {
23 // near * 3
24 2936 svuint16_t near3b = svmullb(near, uint8_t{3});
25 2936 svuint16_t near3t = svmullt(near, uint8_t{3});
26
27 // near * 3 + far
28 2936 svuint16_t near3_far_b = svaddwb(near3b, far);
29 2936 svuint16_t near3_far_t = svaddwt(near3t, far);
30
31 // near * 3 + far + 2
32 2936 svuint16_t near3_far_2b = svaddwb(near3_far_b, uint8_t{2});
33 2936 svuint16_t near3_far_2t = svaddwt(near3_far_t, uint8_t{2});
34
35 // (near * 3 + far + 2) / 4
36 2936 svuint8_t near3_far_2_div4 = svshrnb_n_u16(near3_far_2b, 2);
37 2936 near3_far_2_div4 = svshrnt_n_u16(near3_far_2_div4, near3_far_2t, 2);
38 5872 return near3_far_2_div4;
39 2936 };
40
41 4370 auto lerp2d_vector = [](svbool_t pg, svuint8_t near, svuint8_t mid_a,
42 svuint8_t mid_b, svuint8_t far) KLEIDICV_STREAMING {
43 // near * 9
44 4208 svuint16_t near9b = svmullb(near, uint8_t{9});
45 4208 svuint16_t near9t = svmullt(near, uint8_t{9});
46
47 // mid_a + mid_b
48 4208 svuint16_t midb = svaddlb(mid_a, mid_b);
49 4208 svuint16_t midt = svaddlt(mid_a, mid_b);
50
51 // near * 9 + (mid_a + mid_b) * 3
52 4208 svuint16_t near9_mid3b = svmla_x(pg, near9b, midb, uint16_t{3});
53 4208 svuint16_t near9_mid3t = svmla_x(pg, near9t, midt, uint16_t{3});
54
55 // near * 9 + (mid_a + mid_b) * 3 + far
56 4208 svuint16_t near9_mid3_far_b = svaddwb(near9_mid3b, far);
57 4208 svuint16_t near9_mid3_far_t = svaddwt(near9_mid3t, far);
58
59 // near * 9 + (mid_a + mid_b) * 3 + far + 8
60 4208 svuint16_t near9_mid3_far_8b = svaddwb(near9_mid3_far_b, uint8_t{8});
61 4208 svuint16_t near9_mid3_far_8t = svaddwt(near9_mid3_far_t, uint8_t{8});
62
63 // (near * 9 + (mid_a + mid_b) * 3 + far + 8) / 16
64 4208 svuint8_t near9_mid3_far_8_div16 = svshrnb_n_u16(near9_mid3_far_8b, 4);
65 4208 near9_mid3_far_8_div16 =
66 4208 svshrnt_n_u16(near9_mid3_far_8_div16, near9_mid3_far_8t, 4);
67 8416 return near9_mid3_far_8_div16;
68 4208 };
69
70 // Handle top or bottom edge
71 480 auto process_edge_row = [src_width, dst_width, lerp1d_vector](
72 const uint8_t *src_row,
73 uint8_t *dst_row) KLEIDICV_STREAMING {
74 // Left element
75 318 dst_row[0] = src_row[0];
76
77 // Right element
78 318 dst_row[dst_width - 1] = src_row[src_width - 1];
79
80
2/2
✓ Branch 0 taken 318 times.
✓ Branch 1 taken 580 times.
898 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
81 580 size_t dst_x = src_x * 2 + 1;
82
83 580 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
84
85 580 svuint8_t src_left = svld1_u8(pg, src_row + src_x);
86 580 svuint8_t src_right = svld1_u8(pg, src_row + src_x + 1);
87
88 580 svuint8_t dst_left = lerp1d_vector(src_left, src_right);
89 580 svuint8_t dst_right = lerp1d_vector(src_right, src_left);
90
91 580 svst2_u8(pg, dst_row + dst_x, svcreate2(dst_left, dst_right));
92 580 }
93 318 };
94
95 606 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
96 const uint8_t *src_row0, const uint8_t *src_row1,
97 uint8_t *dst_row0,
98 uint8_t *dst_row1) KLEIDICV_STREAMING {
99 // Left elements
100 444 svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read/write 1 element
101 {
102 444 svuint8_t s0 = svld1(pg1, src_row0);
103 444 svuint8_t s1 = svld1(pg1, src_row1);
104 444 svst1(pg1, dst_row0, lerp1d_vector(s0, s1));
105 444 svst1(pg1, dst_row1, lerp1d_vector(s1, s0));
106 444 }
107
108 // Middle elements
109
2/2
✓ Branch 0 taken 444 times.
✓ Branch 1 taken 1052 times.
1496 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
110 1052 size_t dst_x = src_x * 2 + 1;
111
112 1052 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
113
114 1052 svuint8_t src_tl = svld1_u8(pg, src_row0 + src_x);
115 1052 svuint8_t src_tr = svld1_u8(pg, src_row0 + src_x + 1);
116 1052 svuint8_t src_bl = svld1_u8(pg, src_row1 + src_x);
117 1052 svuint8_t src_br = svld1_u8(pg, src_row1 + src_x + 1);
118
119 1052 svuint8_t dst_tl = lerp2d_vector(pg, src_tl, src_tr, src_bl, src_br);
120 1052 svuint8_t dst_tr = lerp2d_vector(pg, src_tr, src_tl, src_br, src_bl);
121 1052 svuint8_t dst_bl = lerp2d_vector(pg, src_bl, src_tl, src_br, src_tr);
122 1052 svuint8_t dst_br = lerp2d_vector(pg, src_br, src_tr, src_bl, src_tl);
123
124 1052 svst2_u8(pg, dst_row0 + dst_x, svcreate2(dst_tl, dst_tr));
125 1052 svst2_u8(pg, dst_row1 + dst_x, svcreate2(dst_bl, dst_br));
126 1052 }
127
128 // Right elements
129 444 svuint8_t s0 = svld1(pg1, src_row0 + src_width - 1);
130 444 svuint8_t s1 = svld1(pg1, src_row1 + src_width - 1);
131 444 svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(s0, s1));
132 444 svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(s1, s0));
133 444 };
134
135 // Top row
136
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 159 times.
162 if (KLEIDICV_LIKELY(y_begin == 0)) {
137 159 process_edge_row(src, dst);
138 159 }
139
140 // Middle rows
141
2/2
✓ Branch 0 taken 444 times.
✓ Branch 1 taken 162 times.
606 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
142 444 size_t dst_y = src_y * 2 + 1;
143 444 const uint8_t *src_row0 = src + src_stride * src_y;
144 444 const uint8_t *src_row1 = src_row0 + src_stride;
145 444 uint8_t *dst_row0 = dst + dst_stride * dst_y;
146 444 uint8_t *dst_row1 = dst_row0 + dst_stride;
147
148 444 process_row(src_row0, src_row1, dst_row0, dst_row1);
149 444 }
150
151 // Bottom row
152
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 159 times.
162 if (KLEIDICV_LIKELY(y_end == src_height)) {
153 318 process_edge_row(src + src_stride * (src_height - 1),
154 159 dst + dst_stride * (dst_height - 1));
155 159 }
156
157 162 return KLEIDICV_OK;
158 162 }
159
160 102 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc(
161 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
162 size_t y_begin, size_t y_end, uint8_t *dst,
163 size_t dst_stride) KLEIDICV_STREAMING {
164 102 size_t dst_width = src_width * 4;
165 102 size_t dst_height = src_height * 4;
166
167 5110 auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b)
168 KLEIDICV_STREAMING {
169 // bias
170 5008 svuint16_t top = svdup_u16(4);
171
172 // bias + a * p
173 5008 svuint16_t bot = svmlalb(top, a, p);
174 5008 top = svmlalt(top, a, p);
175
176 // bias + a * p + b * q
177 5008 bot = svmlalb(bot, b, q);
178 5008 top = svmlalt(top, b, q);
179
180 // (bias + a * p + b * q) / 8
181 5008 svuint8_t result = svshrnb(bot, 3ULL);
182 5008 result = svshrnt(result, top, 3ULL);
183 10016 return result;
184 5008 };
185
186 15974 auto lerp2d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b,
187 uint8_t r, svuint8_t c, uint8_t s,
188 svuint8_t d) KLEIDICV_STREAMING {
189 // bias
190 15872 svuint16_t top = svdup_u16(32);
191
192 // bias + a * p
193 15872 svuint16_t bot = svmlalb(top, a, p);
194 15872 top = svmlalt(top, a, p);
195
196 // bias + a * p + b * q
197 15872 bot = svmlalb(bot, b, q);
198 15872 top = svmlalt(top, b, q);
199
200 // bias + a * p + b * q + c * r
201 15872 bot = svmlalb(bot, c, r);
202 15872 top = svmlalt(top, c, r);
203
204 // bias + a * p + b * q + c * r + d * s
205 15872 bot = svmlalb(bot, d, s);
206 15872 top = svmlalt(top, d, s);
207
208 // (bias + a * p + b * q + c * r + d * s) / 64
209 15872 svuint8_t result = svshrnt(svshrnb(bot, 6ULL), top, 6ULL);
210 31744 return result;
211 15872 };
212
213 // Handle top or bottom edge
214 300 auto process_edge_row = [src_width, dst_width, lerp1d_vector](
215 const uint8_t *src_row,
216 uint8_t *dst_row) KLEIDICV_STREAMING {
217 // Left elements
218 198 dst_row[1] = dst_row[0] = src_row[0];
219
220 // Right elements
221 198 dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
222
223 // Middle elements
224
2/2
✓ Branch 0 taken 198 times.
✓ Branch 1 taken 484 times.
682 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
225 484 size_t dst_x = src_x * 4 + 2;
226 484 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
227 484 svuint8_t a = svld1_u8(pg, src_row + src_x);
228 484 svuint8_t b = svld1_u8(pg, src_row + src_x + 1);
229 968 svst4_u8(pg, dst_row + dst_x,
230 968 svcreate4(lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b),
231 484 lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)));
232 484 }
233 198 };
234
235 486 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
236 const uint8_t *src_row0, const uint8_t *src_row1,
237 uint8_t *dst_row0, uint8_t *dst_row1,
238 uint8_t *dst_row2,
239 uint8_t *dst_row3) KLEIDICV_STREAMING {
240 // Left elements
241 384 svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read 1 element
242 384 svbool_t pg2 = svptrue_pat_b8(SV_VL2); // write 2 elements
243 {
244 384 svuint8_t s0 = svdup_lane(svld1(pg1, src_row0), 0);
245 384 svuint8_t s1 = svdup_lane(svld1(pg1, src_row1), 0);
246 384 svst1(pg2, dst_row0, lerp1d_vector(7, s0, 1, s1));
247 384 svst1(pg2, dst_row1, lerp1d_vector(5, s0, 3, s1));
248 384 svst1(pg2, dst_row2, lerp1d_vector(3, s0, 5, s1));
249 384 svst1(pg2, dst_row3, lerp1d_vector(1, s0, 7, s1));
250 384 }
251
252 // Middle elements
253
2/2
✓ Branch 0 taken 384 times.
✓ Branch 1 taken 992 times.
1376 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
254 992 size_t dst_x = src_x * 4 + 2;
255
256 992 svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
257
258 992 svuint8_t a = svld1_u8(pg, src_row0 + src_x);
259 992 svuint8_t b = svld1_u8(pg, src_row0 + src_x + 1);
260 992 svuint8_t c = svld1_u8(pg, src_row1 + src_x);
261 992 svuint8_t d = svld1_u8(pg, src_row1 + src_x + 1);
262
263 1984 svst4_u8(pg, dst_row0 + dst_x,
264 1984 (svcreate4(lerp2d_vector(49, a, 7, b, 7, c, 1, d),
265 992 lerp2d_vector(35, a, 21, b, 5, c, 3, d),
266 992 lerp2d_vector(21, a, 35, b, 3, c, 5, d),
267 992 lerp2d_vector(49, b, 7, a, 7, d, 1, c))));
268
269 1984 svst4_u8(pg, dst_row1 + dst_x,
270 1984 (svcreate4(lerp2d_vector(35, a, 5, b, 21, c, 3, d),
271 992 lerp2d_vector(25, a, 15, b, 15, c, 9, d),
272 992 lerp2d_vector(15, a, 25, b, 9, c, 15, d),
273 992 lerp2d_vector(5, a, 35, b, 3, c, 21, d))));
274 1984 svst4_u8(pg, dst_row2 + dst_x,
275 1984 (svcreate4(lerp2d_vector(21, a, 3, b, 35, c, 5, d),
276 992 lerp2d_vector(15, a, 9, b, 25, c, 15, d),
277 992 lerp2d_vector(9, a, 15, b, 15, c, 25, d),
278 992 lerp2d_vector(3, a, 21, b, 5, c, 35, d))));
279 1984 svst4_u8(pg, dst_row3 + dst_x,
280 1984 (svcreate4(lerp2d_vector(49, c, 7, a, 7, d, 1, b),
281 992 lerp2d_vector(5, a, 3, b, 35, c, 21, d),
282 992 lerp2d_vector(3, a, 5, b, 21, c, 35, d),
283 992 lerp2d_vector(49, d, 7, b, 7, c, 1, a))));
284 992 }
285
286 // Right elements
287 384 svuint8_t s0 = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
288 384 svuint8_t s1 = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
289 384 svst1(pg2, dst_row0 + dst_width - 2, lerp1d_vector(7, s0, 1, s1));
290 384 svst1(pg2, dst_row1 + dst_width - 2, lerp1d_vector(5, s0, 3, s1));
291 384 svst1(pg2, dst_row2 + dst_width - 2, lerp1d_vector(3, s0, 5, s1));
292 384 svst1(pg2, dst_row3 + dst_width - 2, lerp1d_vector(1, s0, 7, s1));
293 384 };
294
295 300 auto copy_dst_row = [src_width](const uint8_t *dst_from,
296 uint8_t *dst_to) KLEIDICV_STREAMING {
297
2/2
✓ Branch 0 taken 198 times.
✓ Branch 1 taken 550 times.
748 for (size_t i = 0; i < src_width; i += svcntb()) {
298 550 svbool_t pg = svwhilelt_b8_u64(i, src_width);
299 550 svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4));
300 550 }
301 198 };
302
303 // Top rows
304
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.
102 if (KLEIDICV_LIKELY(y_begin == 0)) {
305 99 process_edge_row(src, dst);
306 99 copy_dst_row(dst, dst + dst_stride);
307 99 }
308
309 // Middle rows
310
2/2
✓ Branch 0 taken 384 times.
✓ Branch 1 taken 102 times.
486 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
311 384 size_t dst_y = src_y * 4 + 2;
312 384 const uint8_t *src_row0 = src + src_stride * src_y;
313 384 const uint8_t *src_row1 = src_row0 + src_stride;
314 384 uint8_t *dst_row0 = dst + dst_stride * dst_y;
315 384 uint8_t *dst_row1 = dst_row0 + dst_stride;
316 384 uint8_t *dst_row2 = dst_row1 + dst_stride;
317 384 uint8_t *dst_row3 = dst_row2 + dst_stride;
318
319 384 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
320 384 }
321
322 // Bottom rows
323
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.
102 if (KLEIDICV_LIKELY(y_end == src_height)) {
324 198 process_edge_row(src + src_stride * (src_height - 1),
325 99 dst + dst_stride * (dst_height - 2));
326 198 copy_dst_row(dst + dst_stride * (dst_height - 2),
327 99 dst + dst_stride * (dst_height - 1));
328 99 }
329
330 102 return KLEIDICV_OK;
331 102 }
332
333 150 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc(
334 const float *src, size_t src_stride, size_t src_width, size_t src_height,
335 size_t y_begin, size_t y_end, float *dst,
336 size_t dst_stride) KLEIDICV_STREAMING {
337 150 size_t dst_width = src_width * 2;
338 150 src_stride /= sizeof(float);
339 150 dst_stride /= sizeof(float);
340
341 5206 auto lerp1d_vector = [](svbool_t pg, svfloat32_t near,
342 svfloat32_t far) KLEIDICV_STREAMING {
343 5056 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.75F), far, 0.25F);
344 };
345
346 13102 auto lerp2d_vector = [](svbool_t pg, svfloat32_t near, svfloat32_t mid_a,
347 svfloat32_t mid_b,
348 svfloat32_t far) KLEIDICV_STREAMING {
349 12952 return svmla_n_f32_x(
350 12952 pg,
351 12952 svmla_n_f32_x(
352 12952 pg,
353 12952 svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.5625F), mid_a, 0.1875F),
354 12952 mid_b, 0.1875F),
355 12952 far, 0.0625F);
356 };
357
358 // Handle top or bottom edge
359 444 auto process_edge_row = [src_width, dst_width, lerp1d_vector](
360 const float *src_row,
361 float *dst_row) KLEIDICV_STREAMING {
362 // Left element
363 294 dst_row[0] = src_row[0];
364
365 // Middle elements
366
2/2
✓ Branch 0 taken 294 times.
✓ Branch 1 taken 1640 times.
1934 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
367 1640 size_t dst_x = src_x * 2 + 1;
368
369 1640 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
370
371 1640 svfloat32_t a = svld1_f32(pg, src_row + src_x);
372 1640 svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
373
374 3280 svst2_f32(pg, dst_row + dst_x,
375 1640 svcreate2(lerp1d_vector(pg, a, b), lerp1d_vector(pg, b, a)));
376 1640 }
377
378 // Right element
379 294 dst_row[dst_width - 1] = src_row[src_width - 1];
380 294 };
381
382 594 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
383 const float *src_row0, const float *src_row1,
384 float *dst_row0, float *dst_row1) KLEIDICV_STREAMING {
385 // Left elements
386 444 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read/write 1 element
387 {
388 444 svfloat32_t s0 = svld1(pg1, src_row0);
389 444 svfloat32_t s1 = svld1(pg1, src_row1);
390 444 svst1(pg1, dst_row0, lerp1d_vector(pg1, s0, s1));
391 444 svst1(pg1, dst_row1, lerp1d_vector(pg1, s1, s0));
392 444 }
393
394 // Middle elements
395
2/2
✓ Branch 0 taken 444 times.
✓ Branch 1 taken 3238 times.
3682 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
396 3238 size_t dst_x = src_x * 2 + 1;
397
398 3238 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
399
400 3238 svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
401 3238 svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
402 3238 svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
403 3238 svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);
404
405 6476 svst2_f32(pg, dst_row0 + dst_x,
406 6476 svcreate2(lerp2d_vector(pg, a, b, c, d),
407 3238 lerp2d_vector(pg, b, a, d, c)));
408 6476 svst2_f32(pg, dst_row1 + dst_x,
409 6476 svcreate2(lerp2d_vector(pg, c, a, d, b),
410 3238 lerp2d_vector(pg, d, b, c, a)));
411 3238 }
412
413 // Right elements
414 444 svfloat32_t s0 = svld1(pg1, src_row0 + src_width - 1);
415 444 svfloat32_t s1 = svld1(pg1, src_row1 + src_width - 1);
416 444 svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(pg1, s0, s1));
417 444 svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(pg1, s1, s0));
418 444 };
419
420 // Top row
421
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 147 times.
150 if (KLEIDICV_LIKELY(y_begin == 0)) {
422 147 process_edge_row(src, dst);
423 147 }
424 // Middle rows
425
2/2
✓ Branch 0 taken 444 times.
✓ Branch 1 taken 150 times.
594 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
426 444 size_t dst_y = src_y * 2 + 1;
427 444 const float *src_row0 = src + src_stride * src_y;
428 444 const float *src_row1 = src_row0 + src_stride;
429 444 float *dst_row0 = dst + dst_stride * dst_y;
430 444 float *dst_row1 = dst_row0 + dst_stride;
431
432 444 process_row(src_row0, src_row1, dst_row0, dst_row1);
433 444 }
434
435 // Bottom row
436
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 147 times.
150 if (KLEIDICV_LIKELY(y_end == src_height)) {
437 294 process_edge_row(src + src_stride * (src_height - 1),
438 147 dst + dst_stride * (src_height * 2 - 1));
439 147 }
440
441 150 return KLEIDICV_OK;
442 150 }
443
444 114 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc(
445 const float *src, size_t src_stride, size_t src_width, size_t src_height,
446 size_t y_begin, size_t y_end, float *dst,
447 size_t dst_stride) KLEIDICV_STREAMING {
448 114 size_t dst_width = src_width * 4;
449 114 size_t dst_height = src_height * 4;
450 114 src_stride /= sizeof(float);
451 114 dst_stride /= sizeof(float);
452
453 35170 auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
454 svfloat32_t b) KLEIDICV_STREAMING {
455 35056 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
456 };
457
458 25634 auto lerp2d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
459 svfloat32_t b, float r, svfloat32_t c, float s,
460 svfloat32_t d) KLEIDICV_STREAMING {
461 25520 return svmla_n_f32_x(
462 25520 pg,
463 51040 svmla_n_f32_x(pg, svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q), c,
464 25520 r),
465 25520 d, s);
466 };
467
468 // Handle top or bottom edge
469 336 auto process_edge_row = [src_width, dst_width, dst_stride, lerp1d_vector](
470 const float *src_row,
471 float *dst_row) KLEIDICV_STREAMING {
472 // Left elements
473 222 dst_row[1] = dst_row[0] = dst_row[dst_stride + 1] = dst_row[dst_stride] =
474 222 src_row[0];
475
476 // Right elements
477 222 dst_row[dst_width - 1] = dst_row[dst_width - 2] =
478 222 dst_row[dst_stride + dst_width - 1] =
479 222 dst_row[dst_stride + dst_width - 2] = src_row[src_width - 1];
480
481 // Middle elements
482
2/2
✓ Branch 0 taken 222 times.
✓ Branch 1 taken 1592 times.
1814 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
483 1592 size_t dst_x = src_x * 4 + 2;
484 1592 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
485 1592 svfloat32_t a = svld1_f32(pg, src_row + src_x);
486 1592 svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
487 3184 svfloat32x4_t result = svcreate4(lerp1d_vector(pg, 0.875F, a, 0.125F, b),
488 1592 lerp1d_vector(pg, 0.625F, a, 0.375F, b),
489 1592 lerp1d_vector(pg, 0.375F, a, 0.625F, b),
490 1592 lerp1d_vector(pg, 0.125F, a, 0.875F, b));
491 1592 svst4_f32(pg, dst_row + dst_x, result);
492 1592 svst4_f32(pg, dst_row + dst_stride + dst_x, result);
493 1592 }
494 222 };
495
496 510 auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
497 const float *src_row0, const float *src_row1,
498 float *dst_row0, float *dst_row1, float *dst_row2,
499 float *dst_row3) KLEIDICV_STREAMING {
500 // Left elements
501 396 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
502 396 svbool_t pg2 = svptrue_pat_b32(SV_VL2); // write 2 elements
503 396 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
504 396 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
505 396 svst1(pg2, dst_row0, lerp1d_vector(pg2, 0.875F, s0l, 0.125F, s1l));
506 396 svst1(pg2, dst_row1, lerp1d_vector(pg2, 0.625F, s0l, 0.375F, s1l));
507 396 svst1(pg2, dst_row2, lerp1d_vector(pg2, 0.375F, s0l, 0.625F, s1l));
508 396 svst1(pg2, dst_row3, lerp1d_vector(pg2, 0.125F, s0l, 0.875F, s1l));
509
510 // Middle elements
511
2/2
✓ Branch 0 taken 396 times.
✓ Branch 1 taken 3190 times.
3586 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
512 3190 size_t dst_x = src_x * 4 + 2;
513
514 3190 svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
515
516 3190 svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
517 3190 svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
518 3190 svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
519 3190 svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);
520
521 6380 svfloat32x4_t dst_a =
522 9570 svcreate4(lerp2d_vector(pg, 0.765625F, a, 0.109375F, b, 0.109375F, c,
523 3190 0.015625F, d),
524 6380 lerp2d_vector(pg, 0.546875F, a, 0.328125F, b, 0.078125F, c,
525 3190 0.046875F, d),
526 6380 lerp2d_vector(pg, 0.328125F, a, 0.546875F, b, 0.046875F, c,
527 3190 0.078125F, d),
528 6380 lerp2d_vector(pg, 0.109375F, a, 0.765625F, b, 0.015625F, c,
529 3190 0.109375F, d));
530 6380 svfloat32x4_t dst_d =
531 9570 svcreate4(lerp2d_vector(pg, 0.109375F, a, 0.015625F, b, 0.765625F, c,
532 3190 0.109375F, d),
533 6380 lerp2d_vector(pg, 0.078125F, a, 0.046875F, b, 0.546875F, c,
534 3190 0.328125F, d),
535 6380 lerp2d_vector(pg, 0.046875F, a, 0.078125F, b, 0.328125F, c,
536 3190 0.546875F, d),
537 6380 lerp2d_vector(pg, 0.015625F, a, 0.109375F, b, 0.109375F, c,
538 3190 0.765625F, d));
539 3190 const float one_3rd = 0.3333333333333333F;
540 3190 const float two_3rd = 0.6666666666666667F;
541 3190 svst4_f32(pg, dst_row0 + dst_x, dst_a);
542 6380 svst4_f32(pg, dst_row1 + dst_x,
543 9570 svcreate4(lerp1d_vector(pg, two_3rd, svget4(dst_a, 0), one_3rd,
544 3190 svget4(dst_d, 0)),
545 6380 lerp1d_vector(pg, two_3rd, svget4(dst_a, 1), one_3rd,
546 3190 svget4(dst_d, 1)),
547 6380 lerp1d_vector(pg, two_3rd, svget4(dst_a, 2), one_3rd,
548 3190 svget4(dst_d, 2)),
549 6380 lerp1d_vector(pg, two_3rd, svget4(dst_a, 3), one_3rd,
550 3190 svget4(dst_d, 3))));
551 6380 svst4_f32(pg, dst_row2 + dst_x,
552 9570 svcreate4(lerp1d_vector(pg, one_3rd, svget4(dst_a, 0), two_3rd,
553 3190 svget4(dst_d, 0)),
554 6380 lerp1d_vector(pg, one_3rd, svget4(dst_a, 1), two_3rd,
555 3190 svget4(dst_d, 1)),
556 6380 lerp1d_vector(pg, one_3rd, svget4(dst_a, 2), two_3rd,
557 3190 svget4(dst_d, 2)),
558 6380 lerp1d_vector(pg, one_3rd, svget4(dst_a, 3), two_3rd,
559 3190 svget4(dst_d, 3))));
560 3190 svst4_f32(pg, dst_row3 + dst_x, dst_d);
561 3190 }
562
563 // Right elements
564 396 svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
565 396 svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
566 792 svst1(pg2, dst_row0 + dst_width - 2,
567 396 lerp1d_vector(pg2, 0.875F, s0r, 0.125F, s1r));
568 792 svst1(pg2, dst_row1 + dst_width - 2,
569 396 lerp1d_vector(pg2, 0.625F, s0r, 0.375F, s1r));
570 792 svst1(pg2, dst_row2 + dst_width - 2,
571 396 lerp1d_vector(pg2, 0.375F, s0r, 0.625F, s1r));
572 792 svst1(pg2, dst_row3 + dst_width - 2,
573 396 lerp1d_vector(pg2, 0.125F, s0r, 0.875F, s1r));
574 396 };
575
576 // Top rows
577
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 111 times.
114 if (KLEIDICV_LIKELY(y_begin == 0)) {
578 111 process_edge_row(src, dst);
579 111 }
580
581 // Middle rows
582
2/2
✓ Branch 0 taken 396 times.
✓ Branch 1 taken 114 times.
510 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
583 396 size_t dst_y = src_y * 4 + 2;
584 396 const float *src_row0 = src + src_stride * src_y;
585 396 const float *src_row1 = src_row0 + src_stride;
586 396 float *dst_row0 = dst + dst_stride * dst_y;
587 396 float *dst_row1 = dst_row0 + dst_stride;
588 396 float *dst_row2 = dst_row1 + dst_stride;
589 396 float *dst_row3 = dst_row2 + dst_stride;
590
591 396 process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
592 396 }
593
594 // Bottom rows
595
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 111 times.
114 if (KLEIDICV_LIKELY(y_end == src_height)) {
596 222 process_edge_row(src + src_stride * (src_height - 1),
597 111 dst + dst_stride * (dst_height - 2));
598 111 }
599 114 return KLEIDICV_OK;
600 114 }
601
602 34 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc(
603 const float *src, size_t src_stride, size_t src_width, size_t src_height,
604 size_t y_begin, size_t y_end, float *dst,
605 size_t dst_stride) KLEIDICV_STREAMING {
606 34 size_t dst_width = src_width * 8;
607 34 size_t dst_height = src_height * 8;
608 34 src_stride /= sizeof(float);
609 34 dst_stride /= sizeof(float);
610
611 34 float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0,
612 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0};
613 34 float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0,
614 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0};
615 34 svfloat32_t coeffs_a0 = svld1(svptrue_b32(), &coeffs_a[0]);
616 34 svfloat32_t coeffs_a1 = svld1(svptrue_b32(), &coeffs_a[4]);
617 34 svfloat32_t coeffs_b0 = svld1(svptrue_b32(), &coeffs_b[0]);
618 34 svfloat32_t coeffs_b1 = svld1(svptrue_b32(), &coeffs_b[4]);
619 68 std::reference_wrapper<svfloat32_t> coeffs_ab[4] = {coeffs_a0, coeffs_a1,
620 68 coeffs_b0, coeffs_b1};
621
622 111618 auto lerp1d_vector_n = [](svbool_t pg, float p, svfloat32_t a, float q,
623 svfloat32_t b) KLEIDICV_STREAMING {
624 111584 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
625 };
626
627 8970 auto lerp1d_vector = [](svbool_t pg, svfloat32_t p, svfloat32_t a,
628 svfloat32_t q, svfloat32_t b) KLEIDICV_STREAMING {
629 8936 return svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q);
630 };
631
632 // Handle top or bottom edge
633 34 auto process_edge_row =
634 100 [src_width, dst_width, lerp1d_vector](
635 const float *src_row, float *dst_row, size_t dst_stride,
636 std::reference_wrapper<svfloat32_t> coeffs_ab[4]) KLEIDICV_STREAMING {
637 // Left elements
638 66 float left = src_row[0];
639 66 float *dst = dst_row;
640
2/2
✓ Branch 0 taken 264 times.
✓ Branch 1 taken 66 times.
330 for (size_t i = 0; i < 4; ++i) {
641 264 *dst++ = left;
642 264 *dst++ = left;
643 264 *dst++ = left;
644 264 *dst = left;
645 264 dst += dst_stride - 3;
646 264 }
647
648 // Middle elements
649 66 svfloat32_t a, b = svdup_n_f32(src_row[0]);
650
2/2
✓ Branch 0 taken 4468 times.
✓ Branch 1 taken 66 times.
4534 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
651 4468 a = b;
652 4468 b = svdup_n_f32(src_row[src_x + 1]);
653 4468 float *dst_row0 = dst_row + src_x * 8 + 4;
654 4468 float *dst_row1 = dst_row0 + dst_stride;
655 4468 float *dst_row2 = dst_row1 + dst_stride;
656 4468 float *dst_row3 = dst_row2 + dst_stride;
657 8936 svfloat32_t dst =
658 4468 lerp1d_vector(svptrue_b32(), coeffs_ab[0], a, coeffs_ab[2], b);
659 4468 svst1(svptrue_b32(), dst_row0, dst);
660 4468 svst1(svptrue_b32(), dst_row1, dst);
661 4468 svst1(svptrue_b32(), dst_row2, dst);
662 4468 svst1(svptrue_b32(), dst_row3, dst);
663 4468 dst = lerp1d_vector(svptrue_b32(), coeffs_ab[1], a, coeffs_ab[3], b);
664 4468 svst1(svptrue_b32(), dst_row0 + 4, dst);
665 4468 svst1(svptrue_b32(), dst_row1 + 4, dst);
666 4468 svst1(svptrue_b32(), dst_row2 + 4, dst);
667 4468 svst1(svptrue_b32(), dst_row3 + 4, dst);
668 4468 }
669
670 // Right elements
671 66 dst = dst_row + dst_width - 4;
672 66 float right = src_row[src_width - 1];
673
2/2
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 264 times.
330 for (size_t i = 0; i < 4; ++i) {
674 264 *dst++ = right;
675 264 *dst++ = right;
676 264 *dst++ = right;
677 264 *dst = right;
678 264 dst += dst_stride - 3;
679 264 }
680 66 };
681
682 34 svfloat32_t coeffs_p0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 15.0 / 16);
683 34 svfloat32_t coeffs_q0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 15.0 / 16);
684 34 svfloat32_t coeffs_r0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 1.0 / 16);
685 34 svfloat32_t coeffs_s0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 1.0 / 16);
686 34 svfloat32_t coeffs_p1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 15.0 / 16);
687 34 svfloat32_t coeffs_q1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 15.0 / 16);
688 34 svfloat32_t coeffs_r1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 1.0 / 16);
689 34 svfloat32_t coeffs_s1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 1.0 / 16);
690
691 272 std::reference_wrapper<svfloat32_t> coeffs_pqrs[8] = {
692 136 coeffs_p0, coeffs_p1, coeffs_q0, coeffs_q1,
693 136 coeffs_r0, coeffs_r1, coeffs_s0, coeffs_s1,
694 };
695
696 36546 auto lerp2d_vector = [](svbool_t pg, svfloat32_t a, svfloat32_t p,
697 svfloat32_t b, svfloat32_t q, svfloat32_t c,
698 svfloat32_t r, svfloat32_t d,
699 svfloat32_t s) KLEIDICV_STREAMING {
700 36512 return svmla_f32_x(
701 36512 pg, svmla_f32_x(pg, svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q), c, r),
702 36512 d, s);
703 };
704
705 162 auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n](
706 const float *src_row0, const float *src_row1,
707 float *dst_row0, size_t dst_stride,
708 std::reference_wrapper<svfloat32_t>
709 coeffs_pqrs[8]) KLEIDICV_STREAMING {
710 // Left elements
711 128 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
712 128 svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements
713 128 float *dst_lr = dst_row0;
714 128 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
715 128 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
716
2/2
✓ Branch 0 taken 1024 times.
✓ Branch 1 taken 128 times.
1152 for (size_t i = 0; i < 8; ++i) {
717 2048 svst1(pg4, dst_lr,
718 2048 lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
719 1024 static_cast<float>(i * 2 + 1) / 16.0F, s1l));
720 1024 dst_lr += dst_stride;
721 1024 }
722
723 // Middle elements
724 128 dst_row0 += 4;
725 128 float *dst_row1 = dst_row0 + dst_stride;
726 128 float *dst_row2 = dst_row1 + dst_stride;
727 128 float *dst_row3 = dst_row2 + dst_stride;
728 128 float *dst_row4 = dst_row3 + dst_stride;
729 128 float *dst_row5 = dst_row4 + dst_stride;
730 128 float *dst_row6 = dst_row5 + dst_stride;
731 128 float *dst_row7 = dst_row6 + dst_stride;
732 128 svfloat32_t a, b = s0l;
733 128 svfloat32_t c, d = s1l;
734
2/2
✓ Branch 0 taken 9128 times.
✓ Branch 1 taken 128 times.
9256 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
735 9128 a = b;
736 9128 b = svdup_lane(svld1(pg1, src_row0 + src_x + 1), 0);
737 9128 c = d;
738 9128 d = svdup_lane(svld1(pg1, src_row1 + src_x + 1), 0);
739 18256 svfloat32_t dst_0 =
740 18256 lerp2d_vector(svptrue_b32(), coeffs_pqrs[0], a, coeffs_pqrs[2], b,
741 9128 coeffs_pqrs[4], c, coeffs_pqrs[6], d);
742 9128 svst1(svptrue_b32(), dst_row0, dst_0);
743 18256 svfloat32_t dst_7 =
744 18256 lerp2d_vector(svptrue_b32(), coeffs_pqrs[4], a, coeffs_pqrs[6], b,
745 9128 coeffs_pqrs[0], c, coeffs_pqrs[2], d);
746 9128 svst1(svptrue_b32(), dst_row7, dst_7);
747 18256 svst1(svptrue_b32(), dst_row1,
748 9128 lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
749 18256 svst1(svptrue_b32(), dst_row2,
750 9128 lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
751 18256 svst1(svptrue_b32(), dst_row3,
752 9128 lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
753 18256 svst1(svptrue_b32(), dst_row4,
754 9128 lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
755 18256 svst1(svptrue_b32(), dst_row5,
756 9128 lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
757 18256 svst1(svptrue_b32(), dst_row6,
758 9128 lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
759 9128 dst_row0 += 4;
760 9128 dst_row1 += 4;
761 9128 dst_row2 += 4;
762 9128 dst_row3 += 4;
763 9128 dst_row4 += 4;
764 9128 dst_row5 += 4;
765 9128 dst_row6 += 4;
766 9128 dst_row7 += 4;
767 18256 dst_0 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[1], a, coeffs_pqrs[3], b,
768 9128 coeffs_pqrs[5], c, coeffs_pqrs[7], d);
769 9128 svst1(svptrue_b32(), dst_row0, dst_0);
770 18256 dst_7 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[5], a, coeffs_pqrs[7], b,
771 9128 coeffs_pqrs[1], c, coeffs_pqrs[3], d);
772 9128 svst1(svptrue_b32(), dst_row7, dst_7);
773 18256 svst1(svptrue_b32(), dst_row1,
774 9128 lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
775 18256 svst1(svptrue_b32(), dst_row2,
776 9128 lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
777 18256 svst1(svptrue_b32(), dst_row3,
778 9128 lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
779 18256 svst1(svptrue_b32(), dst_row4,
780 9128 lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
781 18256 svst1(svptrue_b32(), dst_row5,
782 9128 lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
783 18256 svst1(svptrue_b32(), dst_row6,
784 9128 lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
785 9128 dst_row0 += 4;
786 9128 dst_row1 += 4;
787 9128 dst_row2 += 4;
788 9128 dst_row3 += 4;
789 9128 dst_row4 += 4;
790 9128 dst_row5 += 4;
791 9128 dst_row6 += 4;
792 9128 dst_row7 += 4;
793 9128 }
794
795 // Right elements
796 128 dst_lr = dst_row0;
797 128 svfloat32_t s0r = b;
798 128 svfloat32_t s1r = d;
799
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 1024 times.
1152 for (size_t i = 0; i < 8; ++i) {
800 2048 svst1(pg4, dst_lr,
801 2048 lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
802 1024 static_cast<float>(i * 2 + 1) / 16.0F, s1r));
803 1024 dst_lr += dst_stride;
804 1024 }
805 128 };
806
807 // Top rows
808
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_begin == 0)) {
809 33 process_edge_row(src, dst, dst_stride, coeffs_ab);
810 33 }
811
812 // Middle rows
813
2/2
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
162 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
814 128 size_t dst_y = src_y * 8 + 4;
815 128 const float *src_row0 = src + src_stride * src_y;
816 128 const float *src_row1 = src_row0 + src_stride;
817 256 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
818 128 coeffs_pqrs);
819 128 }
820
821 // Bottom rows
822
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
34 if (KLEIDICV_LIKELY(y_end == src_height)) {
823 66 process_edge_row(src + src_stride * (src_height - 1),
824 33 dst + dst_stride * (dst_height - 4), dst_stride,
825 33 coeffs_ab);
826 33 }
827
828 34 return KLEIDICV_OK;
829 34 }
830
// Linear-interpolating 8x8 upscale of an f32 image for wide vectors: the
// dispatcher (resize_linear_stripe_f32_sc) picks this variant when
// svcntw() >= 8.
// src_stride and dst_stride are divided by sizeof(float) on entry, i.e. they
// are taken in bytes and used as element counts afterwards.
// The top 4 destination rows are written when y_begin == 0, the bottom 4 when
// y_end == src_height. In between, every source row pair (src_y, src_y + 1)
// with y_begin <= src_y and src_y + 1 < y_end produces 8 destination rows
// starting at destination row src_y * 8 + 4.
// Always returns KLEIDICV_OK.
831 68 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc(
832 const float *src, size_t src_stride, size_t src_width, size_t src_height,
833 size_t y_begin, size_t y_end, float *dst,
834 size_t dst_stride) KLEIDICV_STREAMING {
835 68 size_t dst_width = src_width * 8;
836 68 size_t dst_height = src_height * 8;
837 68 src_stride /= sizeof(float);
838 68 dst_stride /= sizeof(float);
839
// svtbl index tables. The *a tables repeat each source lane index 8 times; the
// *b tables hold the same pattern plus one (the next source lane). Tables
// 0..3 serve the four consecutive output vectors produced per loop iteration.
840 68 svuint32_t indices_0a, indices_0b, indices_1a, indices_1b, indices_2a,
841 indices_2b, indices_3a, indices_3b;
842 {
843 // indices for row 0
844 68 svuint32_t tmp_2x = svreinterpret_u32_u64(svindex_u64(0, 0x100000001UL));
845 68 svuint32_t tmp_4x = svzip1(tmp_2x, tmp_2x); // 0, 0, 0, 0, 1, 1, 1, 1, ...
846 68 indices_0a = svzip1(tmp_4x, tmp_4x); // 8 times 0, then 8 times 1, ...
847 68 indices_1a = svzip2(tmp_4x, tmp_4x);
848 // next section, e.g. in case of 512-bit regs (=16 x F32), it is 4, 4, 4, 4,
849 // 5, 5, 5, 5, ...
850 68 tmp_4x = svzip2(tmp_2x, tmp_2x);
851 68 indices_2a = svzip1(tmp_4x, tmp_4x);
852 68 indices_3a = svzip2(tmp_4x, tmp_4x);
853
854 // same as above, just all numbers are bigger by one (for row 1)
855 68 tmp_2x = svreinterpret_u32_u64(svindex_u64(0x100000001UL, 0x100000001UL));
856 68 tmp_4x = svzip1(tmp_2x, tmp_2x); // 1, 1, 1, 1, ...
857 68 indices_0b = svzip1(tmp_4x, tmp_4x); // 8 times 1, then 8 times 2, ...
858 68 indices_1b = svzip2(tmp_4x, tmp_4x);
859 // next section, e.g. in case of 512-bit regs (=16 x F32), it is 5, 5, 5, 5,
860 // 6, 6, 6, 6, ...
861 68 tmp_4x = svzip2(tmp_2x, tmp_2x);
862 68 indices_2b = svzip1(tmp_4x, tmp_4x);
863 68 indices_3b = svzip2(tmp_4x, tmp_4x);
864 68 }
// Table order is {0a, 0b, 1a, 1b, 2a, 2b, 3a, 3b}: output vector N uses
// indices[2 * N] and indices[2 * N + 1].
865 544 std::reference_wrapper<svuint32_t> indices[8] = {
866 272 indices_0a, indices_0b, indices_1a, indices_1b,
867 272 indices_2a, indices_2b, indices_3a, indices_3b};
868
// Horizontal weights: coeffs_b is applied to the "b" (next) lane, and
// coeffs_a = 1 - coeffs_b is applied to the "a" lane.
869 68 svfloat32_t coeffs_a, coeffs_b;
870 {
871 // Prepare 1/16, 3/16, 5/16, ..., 15/16, repeated
872 68 svuint32_t linear = svindex_u32(1, 2);
873 136 svfloat32_t repetitive_float = // mod 16
874 68 svcvt_f32_x(svptrue_b32(), svand_n_u32_m(svptrue_b32(), linear, 0x0F));
875 68 coeffs_b = svdiv_n_f32_x(svptrue_b32(), repetitive_float, 16.0F);
876 68 coeffs_a = svsub_x(svptrue_b32(), svdup_f32(1.0F), coeffs_b);
877 68 }
878 68 std::reference_wrapper<svfloat32_t> coeffs_ab[2] = {coeffs_a, coeffs_b};
879
// Weighted sum of two vectors with scalar weights: a * p + b * q.
880 41940 auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
881 svfloat32_t b) KLEIDICV_STREAMING {
882 41872 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
883 };
884
// Looks up two source lanes per output lane with svtbl and blends them with
// per-lane weights: src[indices_a] * coeffs_ab[0] + src[indices_b] * coeffs_ab[1].
885 3156 auto index_and_lerp1d = [](svbool_t pg, svuint32_t indices_a,
886 svuint32_t indices_b,
887 std::reference_wrapper<svfloat32_t> coeffs_ab[2],
888 svfloat32_t src) KLEIDICV_STREAMING {
889 6176 return svmla_f32_x(pg, svmul_f32_x(pg, svtbl(src, indices_a), coeffs_ab[0]),
890 3088 svtbl(src, indices_b), coeffs_ab[1]);
891 };
892
// Writes 4 destination rows starting at dst_row from a single source row:
// the 4 leftmost / rightmost columns replicate the first / last source sample,
// and the columns in between are interpolated horizontally only.
893 // Handle top or bottom edge
894 68 auto process_edge_row =
895 200 [src_width, dst_width, index_and_lerp1d](
896 const float *src_row, float *dst_row, size_t dst_stride,
897 std::reference_wrapper<svuint32_t> indices[8],
898 std::reference_wrapper<svfloat32_t> coeffs_ab[2]) KLEIDICV_STREAMING {
899 // Left elements
900 132 float left = src_row[0];
901 132 float *dst = dst_row;
902
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 132 times.
660 for (size_t i = 0; i < 4; ++i) {
903 528 *dst++ = left;
904 528 *dst++ = left;
905 528 *dst++ = left;
906 528 *dst = left;
907 528 dst += dst_stride - 3;
908 528 }
909
910 // Middle elements
// Each iteration loads source samples starting at src_x and advances by
// svcntw() / 2. pg_1..pg_4 limit the four output vectors to the dst_length
// interpolated outputs that remain. The same interpolated vector is stored to
// all 4 edge rows.
911
2/2
✓ Branch 0 taken 772 times.
✓ Branch 1 taken 132 times.
904 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
912 772 svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
913 772 svfloat32_t svsrc = svld1_f32(pg, src_row + src_x);
914
915 772 size_t dst_length = 8 * (src_width - src_x - 1);
916 772 svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
917 772 svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
918 772 svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
919 772 svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);
920
921 772 float *dst_row0 = dst_row + src_x * 8 + 4;
922 772 float *dst_row1 = dst_row0 + dst_stride;
923 772 float *dst_row2 = dst_row1 + dst_stride;
924 772 float *dst_row3 = dst_row2 + dst_stride;
925 1544 svfloat32_t dst =
926 772 index_and_lerp1d(pg_1, indices[0], indices[1], coeffs_ab, svsrc);
927 772 svst1(pg_1, dst_row0, dst);
928 772 svst1(pg_1, dst_row1, dst);
929 772 svst1(pg_1, dst_row2, dst);
930 772 svst1(pg_1, dst_row3, dst);
931
932 772 dst =
933 772 index_and_lerp1d(pg_2, indices[2], indices[3], coeffs_ab, svsrc);
934 772 svst1_vnum(pg_2, dst_row0, 1, dst);
935 772 svst1_vnum(pg_2, dst_row1, 1, dst);
936 772 svst1_vnum(pg_2, dst_row2, 1, dst);
937 772 svst1_vnum(pg_2, dst_row3, 1, dst);
938
939 772 dst =
940 772 index_and_lerp1d(pg_3, indices[4], indices[5], coeffs_ab, svsrc);
941 772 svst1_vnum(pg_3, dst_row0, 2, dst);
942 772 svst1_vnum(pg_3, dst_row1, 2, dst);
943 772 svst1_vnum(pg_3, dst_row2, 2, dst);
944 772 svst1_vnum(pg_3, dst_row3, 2, dst);
945
946 772 dst =
947 772 index_and_lerp1d(pg_4, indices[6], indices[7], coeffs_ab, svsrc);
948 772 svst1_vnum(pg_4, dst_row0, 3, dst);
949 772 svst1_vnum(pg_4, dst_row1, 3, dst);
950 772 svst1_vnum(pg_4, dst_row2, 3, dst);
951 772 svst1_vnum(pg_4, dst_row3, 3, dst);
952 772 }
953
954 // Right elements
955 132 dst = dst_row + dst_width - 4;
956 132 float right = src_row[src_width - 1];
957
2/2
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 528 times.
660 for (size_t i = 0; i < 4; ++i) {
958 528 *dst++ = right;
959 528 *dst++ = right;
960 528 *dst++ = right;
961 528 *dst = right;
962 528 dst += dst_stride - 3;
963 528 }
964 132 };
965
// 2D weights: the horizontal weights scaled by 15/16 (p, q: applied to the
// first source row passed to index_and_lerp2d) and by 1/16 (r, s: applied to
// the second source row).
966 68 svfloat32_t coeffs_p = svmul_n_f32_x(svptrue_b32(), coeffs_a, 15.0 / 16);
967 68 svfloat32_t coeffs_q = svmul_n_f32_x(svptrue_b32(), coeffs_b, 15.0 / 16);
968 68 svfloat32_t coeffs_r = svmul_n_f32_x(svptrue_b32(), coeffs_a, 1.0 / 16);
969 68 svfloat32_t coeffs_s = svmul_n_f32_x(svptrue_b32(), coeffs_b, 1.0 / 16);
970 136 std::reference_wrapper<svfloat32_t> coeffs_pqrs[4] = {coeffs_p, coeffs_q,
971 136 coeffs_r, coeffs_s};
972
// 2D blend of four svtbl lookups, with (p, q, r, s) = coeffs_pqrs:
// src0[indices_a] * p + src0[indices_b] * q +
// src1[indices_a] * r + src1[indices_b] * s.
973 12660 auto index_and_lerp2d = [](svbool_t pg, svuint32_t indices_a,
974 svuint32_t indices_b,
975 std::reference_wrapper<svfloat32_t> coeffs_pqrs[4],
976 svfloat32_t src0,
977 svfloat32_t src1) KLEIDICV_STREAMING {
978 12592 return svmla_f32_x(
979 12592 pg,
980 12592 svmla_f32_x(
981 12592 pg,
982 25184 svmla_f32_x(pg,
983 12592 svmul_f32_x(pg, svtbl(src0, indices_a), coeffs_pqrs[0]),
984 12592 svtbl(src0, indices_b), coeffs_pqrs[1]),
985 12592 svtbl(src1, indices_a), coeffs_pqrs[2]),
986 12592 svtbl(src1, indices_b), coeffs_pqrs[3]);
987 };
988
// Produces the 8 destination rows (starting at dst_row) that lie between two
// vertically adjacent source rows, including the 4-column left/right edges.
989 324 auto process_row = [src_width, dst_width, index_and_lerp2d, lerp1d_vector](
990 const float *src_row0, const float *src_row1,
991 float *dst_row, size_t dst_stride,
992 std::reference_wrapper<svuint32_t> indices[8],
993 std::reference_wrapper<svfloat32_t>
994 coeffs_pqrs[4]) KLEIDICV_STREAMING {
995 // Left edge
996 256 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
997 256 svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements
998 256 float *dst_lr = dst_row;
999 256 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
1000 256 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
1001
// Row i of the band blends the two source rows vertically with weights
// (15 - 2 * i) / 16 and (2 * i + 1) / 16; there is no horizontal blend here.
2/2
✓ Branch 0 taken 2048 times.
✓ Branch 1 taken 256 times.
2304 for (size_t i = 0; i < 8; ++i) {
1002 4096 svst1(pg4, dst_lr,
1003 4096 lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
1004 2048 static_cast<float>(i * 2 + 1) / 16.0F, s1l));
1005 2048 dst_lr += dst_stride;
1006 2048 }
1007
1008 // Middle elements
1009
2/2
✓ Branch 0 taken 1574 times.
✓ Branch 1 taken 256 times.
1830 for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
1010 1574 size_t dst_x = src_x * 8 + 4;
1011
1012 1574 svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
1013 1574 svfloat32_t src_0 = svld1_f32(pg, src_row0 + src_x);
1014 1574 svfloat32_t src_1 = svld1_f32(pg, src_row1 + src_x);
1015
1016 1574 size_t dst_length = 8 * (src_width - src_x - 1);
1017 1574 svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
1018 1574 svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
1019 1574 svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
1020 1574 svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);
1021
1022 1574 float *dst_row0 = dst_row + dst_x;
1023 1574 float *dst_row1 = dst_row0 + dst_stride;
1024 1574 float *dst_row2 = dst_row1 + dst_stride;
1025 1574 float *dst_row3 = dst_row2 + dst_stride;
1026 1574 float *dst_row4 = dst_row3 + dst_stride;
1027 1574 float *dst_row5 = dst_row4 + dst_stride;
1028 1574 float *dst_row6 = dst_row5 + dst_stride;
1029 1574 float *dst_row7 = dst_row6 + dst_stride;
1030
// dst_0 and dst_7 are the first and last row of the 8-row band; dst_7 uses the
// same weights with the two source rows swapped. Rows 1..6 are blends of
// dst_0 and dst_7 with weights (7 - k) / 7 and k / 7. The same pattern is
// repeated for each of the four output vectors (pg_1..pg_4).
1031 3148 svfloat32_t dst_0 = index_and_lerp2d(pg_1, indices[0], indices[1],
1032 1574 coeffs_pqrs, src_0, src_1);
1033 1574 svst1(pg_1, dst_row0, dst_0);
1034 3148 svfloat32_t dst_7 = index_and_lerp2d(pg_1, indices[0], indices[1],
1035 1574 coeffs_pqrs, src_1, src_0);
1036 1574 svst1(pg_1, dst_row7, dst_7);
1037 3148 svst1(pg_1, dst_row1,
1038 1574 lerp1d_vector(pg_1, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1039 3148 svst1(pg_1, dst_row2,
1040 1574 lerp1d_vector(pg_1, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1041 3148 svst1(pg_1, dst_row3,
1042 1574 lerp1d_vector(pg_1, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1043 3148 svst1(pg_1, dst_row4,
1044 1574 lerp1d_vector(pg_1, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1045 3148 svst1(pg_1, dst_row5,
1046 1574 lerp1d_vector(pg_1, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1047 3148 svst1(pg_1, dst_row6,
1048 1574 lerp1d_vector(pg_1, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1049
1050 3148 dst_0 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_0,
1051 1574 src_1);
1052 1574 svst1_vnum(pg_2, dst_row0, 1, dst_0);
1053 3148 dst_7 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_1,
1054 1574 src_0);
1055 1574 svst1_vnum(pg_2, dst_row7, 1, dst_7);
1056 3148 svst1_vnum(pg_2, dst_row1, 1,
1057 1574 lerp1d_vector(pg_2, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1058 3148 svst1_vnum(pg_2, dst_row2, 1,
1059 1574 lerp1d_vector(pg_2, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1060 3148 svst1_vnum(pg_2, dst_row3, 1,
1061 1574 lerp1d_vector(pg_2, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1062 3148 svst1_vnum(pg_2, dst_row4, 1,
1063 1574 lerp1d_vector(pg_2, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1064 3148 svst1_vnum(pg_2, dst_row5, 1,
1065 1574 lerp1d_vector(pg_2, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1066 3148 svst1_vnum(pg_2, dst_row6, 1,
1067 1574 lerp1d_vector(pg_2, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1068
1069 3148 dst_0 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_0,
1070 1574 src_1);
1071 1574 svst1_vnum(pg_3, dst_row0, 2, dst_0);
1072 3148 dst_7 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_1,
1073 1574 src_0);
1074 1574 svst1_vnum(pg_3, dst_row7, 2, dst_7);
1075 3148 svst1_vnum(pg_3, dst_row1, 2,
1076 1574 lerp1d_vector(pg_3, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1077 3148 svst1_vnum(pg_3, dst_row2, 2,
1078 1574 lerp1d_vector(pg_3, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1079 3148 svst1_vnum(pg_3, dst_row3, 2,
1080 1574 lerp1d_vector(pg_3, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1081 3148 svst1_vnum(pg_3, dst_row4, 2,
1082 1574 lerp1d_vector(pg_3, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1083 3148 svst1_vnum(pg_3, dst_row5, 2,
1084 1574 lerp1d_vector(pg_3, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1085 3148 svst1_vnum(pg_3, dst_row6, 2,
1086 1574 lerp1d_vector(pg_3, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1087
1088 3148 dst_0 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_0,
1089 1574 src_1);
1090 1574 svst1_vnum(pg_4, dst_row0, 3, dst_0);
1091 3148 dst_7 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_1,
1092 1574 src_0);
1093 1574 svst1_vnum(pg_4, dst_row7, 3, dst_7);
1094 3148 svst1_vnum(pg_4, dst_row1, 3,
1095 1574 lerp1d_vector(pg_4, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
1096 3148 svst1_vnum(pg_4, dst_row2, 3,
1097 1574 lerp1d_vector(pg_4, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
1098 3148 svst1_vnum(pg_4, dst_row3, 3,
1099 1574 lerp1d_vector(pg_4, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
1100 3148 svst1_vnum(pg_4, dst_row4, 3,
1101 1574 lerp1d_vector(pg_4, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
1102 3148 svst1_vnum(pg_4, dst_row5, 3,
1103 1574 lerp1d_vector(pg_4, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
1104 3148 svst1_vnum(pg_4, dst_row6, 3,
1105 1574 lerp1d_vector(pg_4, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
1106 1574 }
1107
1108 // Right edge
1109 256 dst_lr = dst_row;
1110 256 svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
1111 256 svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
1112
// Same vertical weights as the left edge, written to the last 4 columns.
2/2
✓ Branch 0 taken 256 times.
✓ Branch 1 taken 2048 times.
2304 for (size_t i = 0; i < 8; ++i) {
1113 4096 svst1(pg4, dst_lr + dst_width - 4,
1114 4096 lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
1115 2048 static_cast<float>(i * 2 + 1) / 16.0F, s1r));
1116 2048 dst_lr += dst_stride;
1117 2048 }
1118 256 };
1119
1120 // Top rows
1121
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 66 times.
68 if (KLEIDICV_LIKELY(y_begin == 0)) {
1122 66 process_edge_row(src, dst, dst_stride, indices, coeffs_ab);
1123 66 }
1124
1125 // Middle rows
1126
2/2
✓ Branch 0 taken 256 times.
✓ Branch 1 taken 68 times.
324 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
1127 256 size_t dst_y = src_y * 8 + 4;
1128 256 const float *src_row0 = src + src_stride * src_y;
1129 256 const float *src_row1 = src_row0 + src_stride;
1130 512 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
1131 256 indices, coeffs_pqrs);
1132 256 }
1133
1134 // Bottom rows
1135
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 66 times.
68 if (KLEIDICV_LIKELY(y_end == src_height)) {
1136 132 process_edge_row(src + src_stride * (src_height - 1),
1137 66 dst + dst_stride * (dst_height - 4), dst_stride, indices,
1138 66 coeffs_ab);
1139 66 }
1140
1141 68 return KLEIDICV_OK;
1142 68 }
1143
// Entry point for u8 linear upscaling of the stripe [y_begin, y_end).
// Runs the CHECK_* argument checks, returns KLEIDICV_OK without doing any work
// when the source is empty, and otherwise dispatches on the exact
// destination / source ratio: 2x2 -> resize_2x2_u8_sc, 4x4 -> resize_4x4_u8_sc.
// Any other ratio asserts and returns KLEIDICV_ERROR_NOT_IMPLEMENTED.
// NOTE(review): the CHECK_* macros are defined elsewhere; presumably they
// return an error code from this function on failure - confirm.
1144 285 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_u8_sc(
1145 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
1146 size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride,
1147 size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
1148
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 282 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 282 times.
285 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1149
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 279 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 279 times.
282 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1150
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 276 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 273 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 273 times.
279 CHECK_IMAGE_SIZE(dst_width, dst_height);
1151
// Nothing to resize for an empty source image.
1152
4/4
✓ Branch 0 taken 267 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 264 times.
273 if (src_width == 0 || src_height == 0) {
1153 9 return KLEIDICV_OK;
1154 }
// Both dimensions must be scaled by the same supported factor.
1155
3/4
✓ Branch 0 taken 162 times.
✓ Branch 1 taken 102 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 162 times.
264 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
1156 324 return resize_2x2_u8_sc(src, src_stride, src_width, src_height, y_begin,
1157 162 y_end, dst, dst_stride);
1158 }
1159
2/4
✓ Branch 0 taken 102 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 102 times.
102 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
1160 204 return resize_4x4_u8_sc(src, src_stride, src_width, src_height, y_begin,
1161 102 y_end, dst, dst_stride);
1162 }
1163 // resize_linear_f32_is_implemented checked the kernel size already.
// NOTE(review): this is the u8 path, yet the comment above names the f32
// check - presumably the u8 counterpart is meant; confirm against the caller.
// The fallback below is excluded from coverage via the GCOVR_EXCL markers.
1164 // GCOVR_EXCL_START
1165 assert(!"resize ratio not implemented");
1166 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1167 // GCOVR_EXCL_STOP
1168 285 }
1169
// Entry point for f32 linear upscaling of the stripe [y_begin, y_end).
// Runs the CHECK_* argument checks, returns KLEIDICV_OK without doing any work
// when the source is empty, and otherwise dispatches on the exact
// destination / source ratio: 2x2, 4x4 or 8x8. The 8x8 case further selects
// between two kernels depending on the vector length (svcntw()).
// Any other ratio asserts and returns KLEIDICV_ERROR_NOT_IMPLEMENTED.
// NOTE(review): the CHECK_* macros are defined elsewhere; presumably they
// return an error code from this function on failure - confirm.
1170 387 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_f32_sc(
1171 const float *src, size_t src_stride, size_t src_width, size_t src_height,
1172 size_t y_begin, size_t y_end, float *dst, size_t dst_stride,
1173 size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
1174
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 384 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 384 times.
387 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1175
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 381 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 381 times.
384 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1176
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 378 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 375 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 375 times.
381 CHECK_IMAGE_SIZE(dst_width, dst_height);
1177
// Nothing to resize for an empty source image.
1178
4/4
✓ Branch 0 taken 369 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 366 times.
375 if (src_width == 0 || src_height == 0) {
1179 9 return KLEIDICV_OK;
1180 }
// Both dimensions must be scaled by the same supported factor.
1181
3/4
✓ Branch 0 taken 150 times.
✓ Branch 1 taken 216 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 150 times.
366 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
1182 300 return resize_2x2_f32_sc(src, src_stride, src_width, src_height, y_begin,
1183 150 y_end, dst, dst_stride);
1184 }
1185
3/4
✓ Branch 0 taken 114 times.
✓ Branch 1 taken 102 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 114 times.
216 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
1186 228 return resize_4x4_f32_sc(src, src_stride, src_width, src_height, y_begin,
1187 114 y_end, dst, dst_stride);
1188 }
1189
2/4
✓ Branch 0 taken 102 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 102 times.
102 if (src_width * 8 == dst_width && src_height * 8 == dst_height) {
// With at least 8 x f32 lanes per vector, use the table-lookup kernel
// (sve256plus); otherwise fall back to the sve128 kernel.
1190
2/2
✓ Branch 0 taken 68 times.
✓ Branch 1 taken 34 times.
102 if (svcntw() >= 8) {
1191 136 return resize_8x8_f32_sve256plus_sc(src, src_stride, src_width,
1192 68 src_height, y_begin, y_end, dst,
1193 68 dst_stride);
1194 }
1195 68 return resize_8x8_f32_sve128_sc(src, src_stride, src_width, src_height,
1196 34 y_begin, y_end, dst, dst_stride);
1197 }
1198 // resize_linear_f32_is_implemented checked the kernel size already.
// The fallback below is excluded from coverage via the GCOVR_EXCL markers.
1199 // GCOVR_EXCL_START
1200 assert(!"resize ratio not implemented");
1201 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1202 // GCOVR_EXCL_STOP
1203 387 }
1204
1205 } // namespace KLEIDICV_TARGET_NAMESPACE
1206
1207 #endif // KLEIDICV_RESIZE_LINEAR_SC_H
1208