KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_sc.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 827 828 99.9%
Functions: 90 102 88.2%
Branches: 98 104 94.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RESIZE_LINEAR_SC_H
6 #define KLEIDICV_RESIZE_LINEAR_SC_H
7
8 #include <cassert>
9
10 #include "kleidicv/kleidicv.h"
11 #include "kleidicv/sve2.h"
12
13 namespace KLEIDICV_TARGET_NAMESPACE {
14
// Upscales rows [y_begin, y_end) of a single-channel u8 image by 2x2 using
// bilinear interpolation with fixed 3:1 / 9:3:3:1 weights.  Only the
// destination rows derived from the given source-row range are written, so
// the image can be processed in independent horizontal stripes.
KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t kleidicv_resize_2x2_u8_sc(
    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    size_t y_begin, size_t y_end, uint8_t *dst,
    size_t dst_stride) KLEIDICV_STREAMING {
  size_t dst_width = src_width * 2;
  size_t dst_height = src_height * 2;

  // Rounded 1-D blend: (near * 3 + far + 2) / 4.  Even/odd u8 lanes are
  // widened to u16 separately ("b"/"t") and re-interleaved by the final
  // narrowing shifts.
  auto lerp1d_vector = [](svuint8_t near, svuint8_t far) KLEIDICV_STREAMING {
    svuint16_t acc_b = svmullb(near, uint8_t{3});
    svuint16_t acc_t = svmullt(near, uint8_t{3});

    acc_b = svaddwb(acc_b, far);
    acc_t = svaddwt(acc_t, far);

    // +2 rounds the subsequent division by 4 to nearest.
    acc_b = svaddwb(acc_b, uint8_t{2});
    acc_t = svaddwt(acc_t, uint8_t{2});

    svuint8_t out = svshrnb_n_u16(acc_b, 2);
    out = svshrnt_n_u16(out, acc_t, 2);
    return out;
  };

  // Rounded 2-D blend: (near * 9 + (mid_a + mid_b) * 3 + far + 8) / 16.
  auto lerp2d_vector = [](svbool_t pg, svuint8_t near, svuint8_t mid_a,
                          svuint8_t mid_b, svuint8_t far) KLEIDICV_STREAMING {
    svuint16_t acc_b = svmullb(near, uint8_t{9});
    svuint16_t acc_t = svmullt(near, uint8_t{9});

    svuint16_t mid_sum_b = svaddlb(mid_a, mid_b);
    svuint16_t mid_sum_t = svaddlt(mid_a, mid_b);

    acc_b = svmla_x(pg, acc_b, mid_sum_b, uint16_t{3});
    acc_t = svmla_x(pg, acc_t, mid_sum_t, uint16_t{3});

    acc_b = svaddwb(acc_b, far);
    acc_t = svaddwt(acc_t, far);

    // +8 rounds the subsequent division by 16 to nearest.
    acc_b = svaddwb(acc_b, uint8_t{8});
    acc_t = svaddwt(acc_t, uint8_t{8});

    svuint8_t out = svshrnb_n_u16(acc_b, 4);
    out = svshrnt_n_u16(out, acc_t, 4);
    return out;
  };

  // Topmost/bottommost destination row: a single source row expanded
  // horizontally; the corner pixels are copied verbatim.
  auto process_edge_row = [src_width, dst_width, lerp1d_vector](
                              const uint8_t *src_row,
                              uint8_t *dst_row) KLEIDICV_STREAMING {
    dst_row[0] = src_row[0];
    dst_row[dst_width - 1] = src_row[src_width - 1];

    for (size_t x = 0; x + 1 < src_width; x += svcntb()) {
      svbool_t pg = svwhilelt_b8_u64(x + 1, src_width);

      svuint8_t left = svld1_u8(pg, src_row + x);
      svuint8_t right = svld1_u8(pg, src_row + x + 1);

      svst2_u8(pg, dst_row + x * 2 + 1,
               svcreate2(lerp1d_vector(left, right),
                         lerp1d_vector(right, left)));
    }
  };

  // Produces two destination rows from two adjacent source rows.
  auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
                         const uint8_t *src_row0, const uint8_t *src_row1,
                         uint8_t *dst_row0,
                         uint8_t *dst_row1) KLEIDICV_STREAMING {
    svbool_t pg1 = svptrue_pat_b8(SV_VL1);  // read/write 1 element

    // First column: vertical interpolation only.
    {
      svuint8_t top = svld1(pg1, src_row0);
      svuint8_t bottom = svld1(pg1, src_row1);
      svst1(pg1, dst_row0, lerp1d_vector(top, bottom));
      svst1(pg1, dst_row1, lerp1d_vector(bottom, top));
    }

    // Interior columns: each output is a 9/3/3/1-weighted blend of its
    // 2x2 source neighbourhood.
    for (size_t x = 0; x + 1 < src_width; x += svcntb()) {
      size_t out_x = x * 2 + 1;
      svbool_t pg = svwhilelt_b8_u64(x + 1, src_width);

      svuint8_t tl = svld1_u8(pg, src_row0 + x);
      svuint8_t tr = svld1_u8(pg, src_row0 + x + 1);
      svuint8_t bl = svld1_u8(pg, src_row1 + x);
      svuint8_t br = svld1_u8(pg, src_row1 + x + 1);

      svst2_u8(pg, dst_row0 + out_x,
               svcreate2(lerp2d_vector(pg, tl, tr, bl, br),
                         lerp2d_vector(pg, tr, tl, br, bl)));
      svst2_u8(pg, dst_row1 + out_x,
               svcreate2(lerp2d_vector(pg, bl, tl, br, tr),
                         lerp2d_vector(pg, br, tr, bl, tl)));
    }

    // Last column: vertical interpolation only.
    svuint8_t top = svld1(pg1, src_row0 + src_width - 1);
    svuint8_t bottom = svld1(pg1, src_row1 + src_width - 1);
    svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(top, bottom));
    svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(bottom, top));
  };

  // Top border row — only when this stripe includes the image top.
  if (KLEIDICV_LIKELY(y_begin == 0)) {
    process_edge_row(src, dst);
  }

  // Each adjacent pair of source rows yields two interior destination rows.
  for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    const uint8_t *src_row0 = src + src_stride * src_y;
    const uint8_t *src_row1 = src_row0 + src_stride;
    uint8_t *dst_row0 = dst + dst_stride * (src_y * 2 + 1);
    uint8_t *dst_row1 = dst_row0 + dst_stride;

    process_row(src_row0, src_row1, dst_row0, dst_row1);
  }

  // Bottom border row — only when this stripe includes the image bottom.
  if (KLEIDICV_LIKELY(y_end == src_height)) {
    process_edge_row(src + src_stride * (src_height - 1),
                     dst + dst_stride * (dst_height - 1));
  }

  return KLEIDICV_OK;
}
159
// Upscales rows [y_begin, y_end) of a single-channel u8 image by 4x4 using
// bilinear interpolation.  Only the destination rows derived from the given
// source-row range are written, so the image can be processed in
// independent horizontal stripes.  Border destination pixels replicate the
// nearest source pixel.
KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t kleidicv_resize_4x4_u8_sc(
    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    size_t y_begin, size_t y_end, uint8_t *dst,
    size_t dst_stride) KLEIDICV_STREAMING {
  size_t dst_width = src_width * 4;
  size_t dst_height = src_height * 4;

  // Fixed-point 1-D blend: (4 + a * p + b * q) / 8.  Callers pass weights
  // with p + q == 8.  Even/odd u8 lanes accumulate into separate u16
  // vectors ("bot"/"top") and are re-interleaved by the narrowing shifts.
  auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b)
      KLEIDICV_STREAMING {
    // bias (rounds the final division by 8 to nearest)
    svuint16_t top = svdup_u16(4);

    // bias + a * p
    svuint16_t bot = svmlalb(top, a, p);
    top = svmlalt(top, a, p);

    // bias + a * p + b * q
    bot = svmlalb(bot, b, q);
    top = svmlalt(top, b, q);

    // (bias + a * p + b * q) / 8
    svuint8_t result = svshrnb(bot, 3ULL);
    result = svshrnt(result, top, 3ULL);
    return result;
  };

  // Fixed-point 2-D blend: (32 + a*p + b*q + c*r + d*s) / 64.  Callers pass
  // weights with p + q + r + s == 64.
  auto lerp2d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b,
                          uint8_t r, svuint8_t c, uint8_t s,
                          svuint8_t d) KLEIDICV_STREAMING {
    // bias (rounds the final division by 64 to nearest)
    svuint16_t top = svdup_u16(32);

    // bias + a * p
    svuint16_t bot = svmlalb(top, a, p);
    top = svmlalt(top, a, p);

    // bias + a * p + b * q
    bot = svmlalb(bot, b, q);
    top = svmlalt(top, b, q);

    // bias + a * p + b * q + c * r
    bot = svmlalb(bot, c, r);
    top = svmlalt(top, c, r);

    // bias + a * p + b * q + c * r + d * s
    bot = svmlalb(bot, d, s);
    top = svmlalt(top, d, s);

    // (bias + a * p + b * q + c * r + d * s) / 64
    svuint8_t result = svshrnt(svshrnb(bot, 6ULL), top, 6ULL);
    return result;
  };

  // Handle top or bottom edge: one source row expanded horizontally with
  // weights 7/8, 5/8, 3/8, 1/8; the two border columns are replicated.
  auto process_edge_row = [src_width, dst_width, lerp1d_vector](
                              const uint8_t *src_row,
                              uint8_t *dst_row) KLEIDICV_STREAMING {
    // Left elements
    dst_row[1] = dst_row[0] = src_row[0];

    // Right elements
    dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];

    // Middle elements
    for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
      size_t dst_x = src_x * 4 + 2;
      svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
      svuint8_t a = svld1_u8(pg, src_row + src_x);
      svuint8_t b = svld1_u8(pg, src_row + src_x + 1);
      // svst4 interleaves the four horizontal phases between each pair of
      // adjacent source pixels.
      svst4_u8(pg, dst_row + dst_x,
               svcreate4(lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b),
                         lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)));
    }
  };

  // Produces four destination rows from two adjacent source rows.  The
  // first/last two destination columns use vertical-only 1-D weights; the
  // interior uses the full 2-D weight table (each pixel's weights sum to 64).
  auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
                         const uint8_t *src_row0, const uint8_t *src_row1,
                         uint8_t *dst_row0, uint8_t *dst_row1,
                         uint8_t *dst_row2,
                         uint8_t *dst_row3) KLEIDICV_STREAMING {
    // Left elements
    svbool_t pg1 = svptrue_pat_b8(SV_VL1);  // read 1 element
    svbool_t pg2 = svptrue_pat_b8(SV_VL2);  // write 2 elements
    {
      svuint8_t s0 = svdup_lane(svld1(pg1, src_row0), 0);
      svuint8_t s1 = svdup_lane(svld1(pg1, src_row1), 0);
      svst1(pg2, dst_row0, lerp1d_vector(7, s0, 1, s1));
      svst1(pg2, dst_row1, lerp1d_vector(5, s0, 3, s1));
      svst1(pg2, dst_row2, lerp1d_vector(3, s0, 5, s1));
      svst1(pg2, dst_row3, lerp1d_vector(1, s0, 7, s1));
    }

    // Middle elements
    for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
      size_t dst_x = src_x * 4 + 2;

      svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);

      // 2x2 source neighbourhood: a = top-left, b = top-right,
      // c = bottom-left, d = bottom-right.
      svuint8_t a = svld1_u8(pg, src_row0 + src_x);
      svuint8_t b = svld1_u8(pg, src_row0 + src_x + 1);
      svuint8_t c = svld1_u8(pg, src_row1 + src_x);
      svuint8_t d = svld1_u8(pg, src_row1 + src_x + 1);

      // Each svcreate4 packs the four horizontal phases of one destination
      // row.  Some corner entries reorder the operands (e.g. 49,b ...) but
      // express the same weighted sum.
      svst4_u8(pg, dst_row0 + dst_x,
               (svcreate4(lerp2d_vector(49, a, 7, b, 7, c, 1, d),
                          lerp2d_vector(35, a, 21, b, 5, c, 3, d),
                          lerp2d_vector(21, a, 35, b, 3, c, 5, d),
                          lerp2d_vector(49, b, 7, a, 7, d, 1, c))));

      svst4_u8(pg, dst_row1 + dst_x,
               (svcreate4(lerp2d_vector(35, a, 5, b, 21, c, 3, d),
                          lerp2d_vector(25, a, 15, b, 15, c, 9, d),
                          lerp2d_vector(15, a, 25, b, 9, c, 15, d),
                          lerp2d_vector(5, a, 35, b, 3, c, 21, d))));
      svst4_u8(pg, dst_row2 + dst_x,
               (svcreate4(lerp2d_vector(21, a, 3, b, 35, c, 5, d),
                          lerp2d_vector(15, a, 9, b, 25, c, 15, d),
                          lerp2d_vector(9, a, 15, b, 15, c, 25, d),
                          lerp2d_vector(3, a, 21, b, 5, c, 35, d))));
      svst4_u8(pg, dst_row3 + dst_x,
               (svcreate4(lerp2d_vector(49, c, 7, a, 7, d, 1, b),
                          lerp2d_vector(5, a, 3, b, 35, c, 21, d),
                          lerp2d_vector(3, a, 5, b, 21, c, 35, d),
                          lerp2d_vector(49, d, 7, b, 7, c, 1, a))));
    }

    // Right elements
    svuint8_t s0 = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
    svuint8_t s1 = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
    svst1(pg2, dst_row0 + dst_width - 2, lerp1d_vector(7, s0, 1, s1));
    svst1(pg2, dst_row1 + dst_width - 2, lerp1d_vector(5, s0, 3, s1));
    svst1(pg2, dst_row2 + dst_width - 2, lerp1d_vector(3, s0, 5, s1));
    svst1(pg2, dst_row3 + dst_width - 2, lerp1d_vector(1, s0, 7, s1));
  };

  // Duplicates an already-written destination row (second replicated row at
  // the top/bottom border).
  auto copy_dst_row = [src_width](const uint8_t *dst_from,
                                  uint8_t *dst_to) KLEIDICV_STREAMING {
    for (size_t i = 0; i < src_width; i += svcntb()) {
      svbool_t pg = svwhilelt_b8_u64(i, src_width);
      svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4));
    }
  };

  // Top rows — only when this stripe includes the image top.
  if (KLEIDICV_LIKELY(y_begin == 0)) {
    process_edge_row(src, dst);
    copy_dst_row(dst, dst + dst_stride);
  }

  // Middle rows
  for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    size_t dst_y = src_y * 4 + 2;
    const uint8_t *src_row0 = src + src_stride * src_y;
    const uint8_t *src_row1 = src_row0 + src_stride;
    uint8_t *dst_row0 = dst + dst_stride * dst_y;
    uint8_t *dst_row1 = dst_row0 + dst_stride;
    uint8_t *dst_row2 = dst_row1 + dst_stride;
    uint8_t *dst_row3 = dst_row2 + dst_stride;

    process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
  }

  // Bottom rows — only when this stripe includes the image bottom.
  if (KLEIDICV_LIKELY(y_end == src_height)) {
    process_edge_row(src + src_stride * (src_height - 1),
                     dst + dst_stride * (dst_height - 2));
    copy_dst_row(dst + dst_stride * (dst_height - 2),
                 dst + dst_stride * (dst_height - 1));
  }

  return KLEIDICV_OK;
}
332
// Upscales rows [y_begin, y_end) of a single-channel f32 image by 2x2 using
// bilinear interpolation with fixed 0.75/0.25 and 0.5625/0.1875/0.0625
// weights.  Strides are given in bytes and converted to element counts.
KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc(
    const float *src, size_t src_stride, size_t src_width, size_t src_height,
    size_t y_begin, size_t y_end, float *dst,
    size_t dst_stride) KLEIDICV_STREAMING {
  size_t dst_width = src_width * 2;
  src_stride /= sizeof(float);
  dst_stride /= sizeof(float);

  // 0.75 * near + 0.25 * far.
  auto lerp1d_vector = [](svbool_t pg, svfloat32_t near,
                          svfloat32_t far) KLEIDICV_STREAMING {
    svfloat32_t scaled = svmul_n_f32_x(pg, near, 0.75F);
    return svmla_n_f32_x(pg, scaled, far, 0.25F);
  };

  // 0.5625 * near + 0.1875 * mid_a + 0.1875 * mid_b + 0.0625 * far.
  auto lerp2d_vector = [](svbool_t pg, svfloat32_t near, svfloat32_t mid_a,
                          svfloat32_t mid_b,
                          svfloat32_t far) KLEIDICV_STREAMING {
    svfloat32_t acc = svmul_n_f32_x(pg, near, 0.5625F);
    acc = svmla_n_f32_x(pg, acc, mid_a, 0.1875F);
    acc = svmla_n_f32_x(pg, acc, mid_b, 0.1875F);
    return svmla_n_f32_x(pg, acc, far, 0.0625F);
  };

  // Topmost/bottommost destination row: a single source row expanded
  // horizontally; the corner pixels are copied verbatim.
  auto process_edge_row = [src_width, dst_width, lerp1d_vector](
                              const float *src_row,
                              float *dst_row) KLEIDICV_STREAMING {
    dst_row[0] = src_row[0];
    dst_row[dst_width - 1] = src_row[src_width - 1];

    for (size_t x = 0; x + 1 < src_width; x += svcntw()) {
      svbool_t pg = svwhilelt_b32_u64(x + 1, src_width);

      svfloat32_t left = svld1_f32(pg, src_row + x);
      svfloat32_t right = svld1_f32(pg, src_row + x + 1);

      svst2_f32(pg, dst_row + x * 2 + 1,
                svcreate2(lerp1d_vector(pg, left, right),
                          lerp1d_vector(pg, right, left)));
    }
  };

  // Produces two destination rows from two adjacent source rows.
  auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
                         const float *src_row0, const float *src_row1,
                         float *dst_row0, float *dst_row1) KLEIDICV_STREAMING {
    svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read/write 1 element

    // First column: vertical interpolation only.
    {
      svfloat32_t top = svld1(pg1, src_row0);
      svfloat32_t bottom = svld1(pg1, src_row1);
      svst1(pg1, dst_row0, lerp1d_vector(pg1, top, bottom));
      svst1(pg1, dst_row1, lerp1d_vector(pg1, bottom, top));
    }

    // Interior columns: each output blends its 2x2 source neighbourhood.
    for (size_t x = 0; x + 1 < src_width; x += svcntw()) {
      size_t out_x = x * 2 + 1;
      svbool_t pg = svwhilelt_b32_u64(x + 1, src_width);

      svfloat32_t tl = svld1_f32(pg, src_row0 + x);
      svfloat32_t tr = svld1_f32(pg, src_row0 + x + 1);
      svfloat32_t bl = svld1_f32(pg, src_row1 + x);
      svfloat32_t br = svld1_f32(pg, src_row1 + x + 1);

      svst2_f32(pg, dst_row0 + out_x,
                svcreate2(lerp2d_vector(pg, tl, tr, bl, br),
                          lerp2d_vector(pg, tr, tl, br, bl)));
      svst2_f32(pg, dst_row1 + out_x,
                svcreate2(lerp2d_vector(pg, bl, tl, br, tr),
                          lerp2d_vector(pg, br, tr, bl, tl)));
    }

    // Last column: vertical interpolation only.
    svfloat32_t top = svld1(pg1, src_row0 + src_width - 1);
    svfloat32_t bottom = svld1(pg1, src_row1 + src_width - 1);
    svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(pg1, top, bottom));
    svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(pg1, bottom, top));
  };

  // Top border row — only when this stripe includes the image top.
  if (KLEIDICV_LIKELY(y_begin == 0)) {
    process_edge_row(src, dst);
  }

  // Each adjacent pair of source rows yields two interior destination rows.
  for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    const float *src_row0 = src + src_stride * src_y;
    const float *src_row1 = src_row0 + src_stride;
    float *dst_row0 = dst + dst_stride * (src_y * 2 + 1);
    float *dst_row1 = dst_row0 + dst_stride;

    process_row(src_row0, src_row1, dst_row0, dst_row1);
  }

  // Bottom border row — only when this stripe includes the image bottom.
  if (KLEIDICV_LIKELY(y_end == src_height)) {
    process_edge_row(src + src_stride * (src_height - 1),
                     dst + dst_stride * (src_height * 2 - 1));
  }

  return KLEIDICV_OK;
}
443
// Upscales rows [y_begin, y_end) of a single-channel f32 image by 4x4 using
// bilinear interpolation.  Strides are given in bytes and converted to
// element counts.  Border destination pixels replicate the nearest source
// pixel (a 2-pixel-wide replicated frame).
KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc(
    const float *src, size_t src_stride, size_t src_width, size_t src_height,
    size_t y_begin, size_t y_end, float *dst,
    size_t dst_stride) KLEIDICV_STREAMING {
  size_t dst_width = src_width * 4;
  size_t dst_height = src_height * 4;
  src_stride /= sizeof(float);
  dst_stride /= sizeof(float);

  // 1-D blend: a * p + b * q (callers pass p + q == 1).
  auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
                          svfloat32_t b) KLEIDICV_STREAMING {
    return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
  };

  // 2-D blend: a*p + b*q + c*r + d*s (callers pass p + q + r + s == 1).
  auto lerp2d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
                          svfloat32_t b, float r, svfloat32_t c, float s,
                          svfloat32_t d) KLEIDICV_STREAMING {
    return svmla_n_f32_x(
        pg,
        svmla_n_f32_x(pg, svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q), c,
                      r),
        d, s);
  };

  // Handle top or bottom edge: one source row expanded horizontally with
  // weights 7/8, 5/8, 3/8, 1/8, written to TWO destination rows (dst_row
  // and dst_row + dst_stride); border columns are replicated.
  auto process_edge_row = [src_width, dst_width, dst_stride, lerp1d_vector](
                              const float *src_row,
                              float *dst_row) KLEIDICV_STREAMING {
    // Left elements
    dst_row[1] = dst_row[0] = dst_row[dst_stride + 1] = dst_row[dst_stride] =
        src_row[0];

    // Right elements
    dst_row[dst_width - 1] = dst_row[dst_width - 2] =
        dst_row[dst_stride + dst_width - 1] =
            dst_row[dst_stride + dst_width - 2] = src_row[src_width - 1];

    // Middle elements
    for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
      size_t dst_x = src_x * 4 + 2;
      svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
      svfloat32_t a = svld1_f32(pg, src_row + src_x);
      svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
      svfloat32x4_t result = svcreate4(lerp1d_vector(pg, 0.875F, a, 0.125F, b),
                                       lerp1d_vector(pg, 0.625F, a, 0.375F, b),
                                       lerp1d_vector(pg, 0.375F, a, 0.625F, b),
                                       lerp1d_vector(pg, 0.125F, a, 0.875F, b));
      svst4_f32(pg, dst_row + dst_x, result);
      svst4_f32(pg, dst_row + dst_stride + dst_x, result);
    }
  };

  // Produces four destination rows from two adjacent source rows.  Only the
  // outer two rows use the full 2-D weight table; the middle two rows are
  // derived as 2/3 : 1/3 blends of the outer rows, which reproduces the
  // 5/8 and 3/8 vertical weights (2/3 * 7/8 + 1/3 * 1/8 == 5/8) with fewer
  // multiply-accumulates.
  auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
                         const float *src_row0, const float *src_row1,
                         float *dst_row0, float *dst_row1, float *dst_row2,
                         float *dst_row3) KLEIDICV_STREAMING {
    // Left elements
    svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read 1 element
    svbool_t pg2 = svptrue_pat_b32(SV_VL2);  // write 2 elements
    svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
    svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
    svst1(pg2, dst_row0, lerp1d_vector(pg2, 0.875F, s0l, 0.125F, s1l));
    svst1(pg2, dst_row1, lerp1d_vector(pg2, 0.625F, s0l, 0.375F, s1l));
    svst1(pg2, dst_row2, lerp1d_vector(pg2, 0.375F, s0l, 0.625F, s1l));
    svst1(pg2, dst_row3, lerp1d_vector(pg2, 0.125F, s0l, 0.875F, s1l));

    // Middle elements
    for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
      size_t dst_x = src_x * 4 + 2;

      svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);

      // 2x2 source neighbourhood: a = top-left, b = top-right,
      // c = bottom-left, d = bottom-right.
      svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
      svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
      svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
      svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);

      // Four horizontal phases of the topmost of the four output rows
      // (weights are x/64 expressed in binary fractions, summing to 1).
      svfloat32x4_t dst_a =
          svcreate4(lerp2d_vector(pg, 0.765625F, a, 0.109375F, b, 0.109375F, c,
                                  0.015625F, d),
                    lerp2d_vector(pg, 0.546875F, a, 0.328125F, b, 0.078125F, c,
                                  0.046875F, d),
                    lerp2d_vector(pg, 0.328125F, a, 0.546875F, b, 0.046875F, c,
                                  0.078125F, d),
                    lerp2d_vector(pg, 0.109375F, a, 0.765625F, b, 0.015625F, c,
                                  0.109375F, d));
      // Four horizontal phases of the bottommost output row.
      svfloat32x4_t dst_d =
          svcreate4(lerp2d_vector(pg, 0.109375F, a, 0.015625F, b, 0.765625F, c,
                                  0.109375F, d),
                    lerp2d_vector(pg, 0.078125F, a, 0.046875F, b, 0.546875F, c,
                                  0.328125F, d),
                    lerp2d_vector(pg, 0.046875F, a, 0.078125F, b, 0.328125F, c,
                                  0.546875F, d),
                    lerp2d_vector(pg, 0.015625F, a, 0.109375F, b, 0.109375F, c,
                                  0.765625F, d));
      const float one_3rd = 0.3333333333333333F;
      const float two_3rd = 0.6666666666666667F;
      svst4_f32(pg, dst_row0 + dst_x, dst_a);
      // Inner rows: blends of the already-computed outer rows (see note
      // above the lambda).
      svst4_f32(pg, dst_row1 + dst_x,
                svcreate4(lerp1d_vector(pg, two_3rd, svget4(dst_a, 0), one_3rd,
                                        svget4(dst_d, 0)),
                          lerp1d_vector(pg, two_3rd, svget4(dst_a, 1), one_3rd,
                                        svget4(dst_d, 1)),
                          lerp1d_vector(pg, two_3rd, svget4(dst_a, 2), one_3rd,
                                        svget4(dst_d, 2)),
                          lerp1d_vector(pg, two_3rd, svget4(dst_a, 3), one_3rd,
                                        svget4(dst_d, 3))));
      svst4_f32(pg, dst_row2 + dst_x,
                svcreate4(lerp1d_vector(pg, one_3rd, svget4(dst_a, 0), two_3rd,
                                        svget4(dst_d, 0)),
                          lerp1d_vector(pg, one_3rd, svget4(dst_a, 1), two_3rd,
                                        svget4(dst_d, 1)),
                          lerp1d_vector(pg, one_3rd, svget4(dst_a, 2), two_3rd,
                                        svget4(dst_d, 2)),
                          lerp1d_vector(pg, one_3rd, svget4(dst_a, 3), two_3rd,
                                        svget4(dst_d, 3))));
      svst4_f32(pg, dst_row3 + dst_x, dst_d);
    }

    // Right elements
    svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
    svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
    svst1(pg2, dst_row0 + dst_width - 2,
          lerp1d_vector(pg2, 0.875F, s0r, 0.125F, s1r));
    svst1(pg2, dst_row1 + dst_width - 2,
          lerp1d_vector(pg2, 0.625F, s0r, 0.375F, s1r));
    svst1(pg2, dst_row2 + dst_width - 2,
          lerp1d_vector(pg2, 0.375F, s0r, 0.625F, s1r));
    svst1(pg2, dst_row3 + dst_width - 2,
          lerp1d_vector(pg2, 0.125F, s0r, 0.875F, s1r));
  };

  // Top rows — only when this stripe includes the image top.  The edge
  // handler writes both of the two replicated top rows itself.
  if (KLEIDICV_LIKELY(y_begin == 0)) {
    process_edge_row(src, dst);
  }

  // Middle rows
  for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    size_t dst_y = src_y * 4 + 2;
    const float *src_row0 = src + src_stride * src_y;
    const float *src_row1 = src_row0 + src_stride;
    float *dst_row0 = dst + dst_stride * dst_y;
    float *dst_row1 = dst_row0 + dst_stride;
    float *dst_row2 = dst_row1 + dst_stride;
    float *dst_row3 = dst_row2 + dst_stride;

    process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
  }

  // Bottom rows — only when this stripe includes the image bottom.
  if (KLEIDICV_LIKELY(y_end == src_height)) {
    process_edge_row(src + src_stride * (src_height - 1),
                     dst + dst_stride * (dst_height - 2));
  }
  return KLEIDICV_OK;
}
601
602 58 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc(
603 const float *src, size_t src_stride, size_t src_width, size_t src_height,
604 size_t y_begin, size_t y_end, float *dst,
605 size_t dst_stride) KLEIDICV_STREAMING {
606 58 size_t dst_width = src_width * 8;
607 58 size_t dst_height = src_height * 8;
608 58 src_stride /= sizeof(float);
609 58 dst_stride /= sizeof(float);
610
611 58 float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0,
612 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0};
613 58 float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0,
614 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0};
615 58 svfloat32_t coeffs_a0 = svld1(svptrue_b32(), &coeffs_a[0]);
616 58 svfloat32_t coeffs_a1 = svld1(svptrue_b32(), &coeffs_a[4]);
617 58 svfloat32_t coeffs_b0 = svld1(svptrue_b32(), &coeffs_b[0]);
618 58 svfloat32_t coeffs_b1 = svld1(svptrue_b32(), &coeffs_b[4]);
619 116 std::reference_wrapper<svfloat32_t> coeffs_ab[4] = {coeffs_a0, coeffs_a1,
620 116 coeffs_b0, coeffs_b1};
621
622 124450 auto lerp1d_vector_n = [](svbool_t pg, float p, svfloat32_t a, float q,
623 svfloat32_t b) KLEIDICV_STREAMING {
624 124392 return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
625 };
626
627 9330 auto lerp1d_vector = [](svbool_t pg, svfloat32_t p, svfloat32_t a,
628 svfloat32_t q, svfloat32_t b) KLEIDICV_STREAMING {
629 9272 return svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q);
630 };
631
632 // Handle top or bottom edge
633 58 auto process_edge_row =
634 140 [src_width, dst_width, lerp1d_vector](
635 const float *src_row, float *dst_row, size_t dst_stride,
636 std::reference_wrapper<svfloat32_t> coeffs_ab[4]) KLEIDICV_STREAMING {
637 // Left elements
638 82 float left = src_row[0];
639 82 float *dst = dst_row;
640
2/2
✓ Branch 0 taken 328 times.
✓ Branch 1 taken 82 times.
410 for (size_t i = 0; i < 4; ++i) {
641 328 *dst++ = left;
642 328 *dst++ = left;
643 328 *dst++ = left;
644 328 *dst = left;
645 328 dst += dst_stride - 3;
646 328 }
647
648 // Middle elements
649 82 svfloat32_t a, b = svdup_n_f32(src_row[0]);
650
2/2
✓ Branch 0 taken 4636 times.
✓ Branch 1 taken 82 times.
4718 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
651 4636 a = b;
652 4636 b = svdup_n_f32(src_row[src_x + 1]);
653 4636 float *dst_row0 = dst_row + src_x * 8 + 4;
654 4636 float *dst_row1 = dst_row0 + dst_stride;
655 4636 float *dst_row2 = dst_row1 + dst_stride;
656 4636 float *dst_row3 = dst_row2 + dst_stride;
657 9272 svfloat32_t dst =
658 4636 lerp1d_vector(svptrue_b32(), coeffs_ab[0], a, coeffs_ab[2], b);
659 4636 svst1(svptrue_b32(), dst_row0, dst);
660 4636 svst1(svptrue_b32(), dst_row1, dst);
661 4636 svst1(svptrue_b32(), dst_row2, dst);
662 4636 svst1(svptrue_b32(), dst_row3, dst);
663 4636 dst = lerp1d_vector(svptrue_b32(), coeffs_ab[1], a, coeffs_ab[3], b);
664 4636 svst1(svptrue_b32(), dst_row0 + 4, dst);
665 4636 svst1(svptrue_b32(), dst_row1 + 4, dst);
666 4636 svst1(svptrue_b32(), dst_row2 + 4, dst);
667 4636 svst1(svptrue_b32(), dst_row3 + 4, dst);
668 4636 }
669
670 // Right elements
671 82 dst = dst_row + dst_width - 4;
672 82 float right = src_row[src_width - 1];
673
2/2
✓ Branch 0 taken 82 times.
✓ Branch 1 taken 328 times.
410 for (size_t i = 0; i < 4; ++i) {
674 328 *dst++ = right;
675 328 *dst++ = right;
676 328 *dst++ = right;
677 328 *dst = right;
678 328 dst += dst_stride - 3;
679 328 }
680 82 };
681
682 58 svfloat32_t coeffs_p0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 15.0 / 16);
683 58 svfloat32_t coeffs_q0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 15.0 / 16);
684 58 svfloat32_t coeffs_r0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 1.0 / 16);
685 58 svfloat32_t coeffs_s0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 1.0 / 16);
686 58 svfloat32_t coeffs_p1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 15.0 / 16);
687 58 svfloat32_t coeffs_q1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 15.0 / 16);
688 58 svfloat32_t coeffs_r1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 1.0 / 16);
689 58 svfloat32_t coeffs_s1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 1.0 / 16);
690
691 464 std::reference_wrapper<svfloat32_t> coeffs_pqrs[8] = {
692 232 coeffs_p0, coeffs_p1, coeffs_q0, coeffs_q1,
693 232 coeffs_r0, coeffs_r1, coeffs_s0, coeffs_s1,
694 };
695
696 40402 auto lerp2d_vector = [](svbool_t pg, svfloat32_t a, svfloat32_t p,
697 svfloat32_t b, svfloat32_t q, svfloat32_t c,
698 svfloat32_t r, svfloat32_t d,
699 svfloat32_t s) KLEIDICV_STREAMING {
700 40344 return svmla_f32_x(
701 40344 pg, svmla_f32_x(pg, svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q), c, r),
702 40344 d, s);
703 };
704
705 268 auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n](
706 const float *src_row0, const float *src_row1,
707 float *dst_row0, size_t dst_stride,
708 std::reference_wrapper<svfloat32_t>
709 coeffs_pqrs[8]) KLEIDICV_STREAMING {
710 // Left elements
711 210 svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element
712 210 svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements
713 210 float *dst_lr = dst_row0;
714 210 svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
715 210 svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
716
2/2
✓ Branch 0 taken 1680 times.
✓ Branch 1 taken 210 times.
1890 for (size_t i = 0; i < 8; ++i) {
717 3360 svst1(pg4, dst_lr,
718 3360 lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
719 1680 static_cast<float>(i * 2 + 1) / 16.0F, s1l));
720 1680 dst_lr += dst_stride;
721 1680 }
722
723 // Middle elements
724 210 dst_row0 += 4;
725 210 float *dst_row1 = dst_row0 + dst_stride;
726 210 float *dst_row2 = dst_row1 + dst_stride;
727 210 float *dst_row3 = dst_row2 + dst_stride;
728 210 float *dst_row4 = dst_row3 + dst_stride;
729 210 float *dst_row5 = dst_row4 + dst_stride;
730 210 float *dst_row6 = dst_row5 + dst_stride;
731 210 float *dst_row7 = dst_row6 + dst_stride;
732 210 svfloat32_t a, b = s0l;
733 210 svfloat32_t c, d = s1l;
734
2/2
✓ Branch 0 taken 10086 times.
✓ Branch 1 taken 210 times.
10296 for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
735 10086 a = b;
736 10086 b = svdup_lane(svld1(pg1, src_row0 + src_x + 1), 0);
737 10086 c = d;
738 10086 d = svdup_lane(svld1(pg1, src_row1 + src_x + 1), 0);
739 20172 svfloat32_t dst_0 =
740 20172 lerp2d_vector(svptrue_b32(), coeffs_pqrs[0], a, coeffs_pqrs[2], b,
741 10086 coeffs_pqrs[4], c, coeffs_pqrs[6], d);
742 10086 svst1(svptrue_b32(), dst_row0, dst_0);
743 20172 svfloat32_t dst_7 =
744 20172 lerp2d_vector(svptrue_b32(), coeffs_pqrs[4], a, coeffs_pqrs[6], b,
745 10086 coeffs_pqrs[0], c, coeffs_pqrs[2], d);
746 10086 svst1(svptrue_b32(), dst_row7, dst_7);
747 20172 svst1(svptrue_b32(), dst_row1,
748 10086 lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
749 20172 svst1(svptrue_b32(), dst_row2,
750 10086 lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
751 20172 svst1(svptrue_b32(), dst_row3,
752 10086 lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
753 20172 svst1(svptrue_b32(), dst_row4,
754 10086 lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
755 20172 svst1(svptrue_b32(), dst_row5,
756 10086 lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
757 20172 svst1(svptrue_b32(), dst_row6,
758 10086 lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
759 10086 dst_row0 += 4;
760 10086 dst_row1 += 4;
761 10086 dst_row2 += 4;
762 10086 dst_row3 += 4;
763 10086 dst_row4 += 4;
764 10086 dst_row5 += 4;
765 10086 dst_row6 += 4;
766 10086 dst_row7 += 4;
767 20172 dst_0 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[1], a, coeffs_pqrs[3], b,
768 10086 coeffs_pqrs[5], c, coeffs_pqrs[7], d);
769 10086 svst1(svptrue_b32(), dst_row0, dst_0);
770 20172 dst_7 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[5], a, coeffs_pqrs[7], b,
771 10086 coeffs_pqrs[1], c, coeffs_pqrs[3], d);
772 10086 svst1(svptrue_b32(), dst_row7, dst_7);
773 20172 svst1(svptrue_b32(), dst_row1,
774 10086 lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
775 20172 svst1(svptrue_b32(), dst_row2,
776 10086 lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
777 20172 svst1(svptrue_b32(), dst_row3,
778 10086 lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
779 20172 svst1(svptrue_b32(), dst_row4,
780 10086 lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
781 20172 svst1(svptrue_b32(), dst_row5,
782 10086 lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
783 20172 svst1(svptrue_b32(), dst_row6,
784 10086 lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
785 10086 dst_row0 += 4;
786 10086 dst_row1 += 4;
787 10086 dst_row2 += 4;
788 10086 dst_row3 += 4;
789 10086 dst_row4 += 4;
790 10086 dst_row5 += 4;
791 10086 dst_row6 += 4;
792 10086 dst_row7 += 4;
793 10086 }
794
795 // Right elements
796 210 dst_lr = dst_row0;
797 210 svfloat32_t s0r = b;
798 210 svfloat32_t s1r = d;
799
2/2
✓ Branch 0 taken 210 times.
✓ Branch 1 taken 1680 times.
1890 for (size_t i = 0; i < 8; ++i) {
800 3360 svst1(pg4, dst_lr,
801 3360 lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
802 1680 static_cast<float>(i * 2 + 1) / 16.0F, s1r));
803 1680 dst_lr += dst_stride;
804 1680 }
805 210 };
806
807 // Top rows
808
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
58 if (KLEIDICV_LIKELY(y_begin == 0)) {
809 41 process_edge_row(src, dst, dst_stride, coeffs_ab);
810 41 }
811
812 // Middle rows
813
2/2
✓ Branch 0 taken 210 times.
✓ Branch 1 taken 58 times.
268 for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
814 210 size_t dst_y = src_y * 8 + 4;
815 210 const float *src_row0 = src + src_stride * src_y;
816 210 const float *src_row1 = src_row0 + src_stride;
817 420 process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
818 210 coeffs_pqrs);
819 210 }
820
821 // Bottom rows
822
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
58 if (KLEIDICV_LIKELY(y_end == src_height)) {
823 82 process_edge_row(src + src_stride * (src_height - 1),
824 41 dst + dst_stride * (dst_height - 4), dst_stride,
825 41 coeffs_ab);
826 41 }
827
828 58 return KLEIDICV_OK;
829 58 }
830
// Bilinearly upscales one horizontal stripe (source rows [y_begin, y_end)) of
// an f32 image by a fixed 8x8 factor.  This variant is selected by the
// dispatcher only when svcntw() >= 8, i.e. the SVE vector length is at least
// 256 bits and one vector holds a full 8-wide output group per source pixel.
// `src_stride`/`dst_stride` arrive in bytes and are converted to element
// counts below; widths/heights are in elements.  Always returns KLEIDICV_OK.
KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc(
    const float *src, size_t src_stride, size_t src_width, size_t src_height,
    size_t y_begin, size_t y_end, float *dst,
    size_t dst_stride) KLEIDICV_STREAMING {
  size_t dst_width = src_width * 8;
  size_t dst_height = src_height * 8;
  // Convert byte strides to element strides for pointer arithmetic.
  src_stride /= sizeof(float);
  dst_stride /= sizeof(float);

  // Gather tables for svtbl: for destination lane i, the `a` tables select
  // source element i / 8 (the "near" sample) and the `b` tables select
  // element i / 8 + 1 (the "far" sample).  Pairs _0.._3 cover four
  // consecutive vectors' worth of output lanes.
  svuint32_t indices_0a, indices_0b, indices_1a, indices_1b, indices_2a,
      indices_2b, indices_3a, indices_3b;
  {
    // indices for row 0
    svuint32_t tmp_2x = svreinterpret_u32_u64(svindex_u64(0, 0x100000001UL));
    svuint32_t tmp_4x = svzip1(tmp_2x, tmp_2x);  // 0, 0, 0, 0, 1, 1, 1, 1, ...
    indices_0a = svzip1(tmp_4x, tmp_4x);  // 8 times 0, then 8 times 1, ...
    indices_1a = svzip2(tmp_4x, tmp_4x);
    // next section, e.g. in case of 512-bit regs (=16 x F32), it is 4, 4, 4, 4,
    // 5, 5, 5, 5, ...
    tmp_4x = svzip2(tmp_2x, tmp_2x);
    indices_2a = svzip1(tmp_4x, tmp_4x);
    indices_3a = svzip2(tmp_4x, tmp_4x);

    // same as above, just all numbers are bigger by one (for row 1)
    tmp_2x = svreinterpret_u32_u64(svindex_u64(0x100000001UL, 0x100000001UL));
    tmp_4x = svzip1(tmp_2x, tmp_2x);  // 1, 1, 1, 1, ...
    indices_0b = svzip1(tmp_4x, tmp_4x);  // 8 times 1, then 8 times 2, ...
    indices_1b = svzip2(tmp_4x, tmp_4x);
    // next section, e.g. in case of 512-bit regs (=16 x F32), it is 5, 5, 5, 5,
    // 6, 6, 6, 6, ...
    tmp_4x = svzip2(tmp_2x, tmp_2x);
    indices_2b = svzip1(tmp_4x, tmp_4x);
    indices_3b = svzip2(tmp_4x, tmp_4x);
  }
  // Packed as an array so lambdas can take all eight tables via one pointer
  // parameter (sizeless SVE types cannot be captured or stored directly).
  std::reference_wrapper<svuint32_t> indices[8] = {
      indices_0a, indices_0b, indices_1a, indices_1b,
      indices_2a, indices_2b, indices_3a, indices_3b};

  // Horizontal interpolation weights for the 8 sub-pixel centres:
  // coeffs_b cycles through 1/16, 3/16, 5/16, ..., 15/16 per lane, and
  // coeffs_a = 1 - coeffs_b weights the "near" sample.
  svfloat32_t coeffs_a, coeffs_b;
  {
    // Prepare 1/16, 3/16, 5/16, ..., 15/16, repeated
    svuint32_t linear = svindex_u32(1, 2);
    svfloat32_t repetitive_float =  // mod 16
        svcvt_f32_x(svptrue_b32(), svand_n_u32_m(svptrue_b32(), linear, 0x0F));
    coeffs_b = svdiv_n_f32_x(svptrue_b32(), repetitive_float, 16.0F);
    coeffs_a = svsub_x(svptrue_b32(), svdup_f32(1.0F), coeffs_b);
  }
  std::reference_wrapper<svfloat32_t> coeffs_ab[2] = {coeffs_a, coeffs_b};

  // Scalar-weight 1-D lerp: p * a + q * b (callers pass p + q == 1).
  auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
                          svfloat32_t b) KLEIDICV_STREAMING {
    return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
  };

  // Horizontal-only lerp: gathers near/far samples from `src` through the
  // index tables and blends them with the per-lane coeffs_ab weights.
  auto index_and_lerp1d = [](svbool_t pg, svuint32_t indices_a,
                             svuint32_t indices_b,
                             std::reference_wrapper<svfloat32_t> coeffs_ab[2],
                             svfloat32_t src) KLEIDICV_STREAMING {
    return svmla_f32_x(pg, svmul_f32_x(pg, svtbl(src, indices_a), coeffs_ab[0]),
                       svtbl(src, indices_b), coeffs_ab[1]);
  };

  // Handle top or bottom edge
  // Writes the 4 outermost destination rows, which depend on a single source
  // row: the 4x4 corner blocks replicate the corner pixel, and the middle is
  // horizontally interpolated only (the same vector is stored into all 4
  // rows, since there is no second source row to blend with).
  auto process_edge_row =
      [src_width, dst_width, index_and_lerp1d](
          const float *src_row, float *dst_row, size_t dst_stride,
          std::reference_wrapper<svuint32_t> indices[8],
          std::reference_wrapper<svfloat32_t> coeffs_ab[2]) KLEIDICV_STREAMING {
        // Left elements
        // Replicate the leftmost source pixel into a 4x4 corner block.
        float left = src_row[0];
        float *dst = dst_row;
        for (size_t i = 0; i < 4; ++i) {
          *dst++ = left;
          *dst++ = left;
          *dst++ = left;
          *dst = left;
          dst += dst_stride - 3;
        }

        // Middle elements
        // Each iteration consumes svcntw()/2 source pixels and emits up to
        // 4 output vectors (8 output lanes per source pixel).
        for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
          svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
          svfloat32_t svsrc = svld1_f32(pg, src_row + src_x);

          // Remaining valid middle output lanes; predicates pg_1..pg_4 mask
          // the four consecutive output vectors against this limit.
          size_t dst_length = 8 * (src_width - src_x - 1);
          svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
          svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
          svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
          svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);

          // "+ 4" skips the corner block handled above.
          float *dst_row0 = dst_row + src_x * 8 + 4;
          float *dst_row1 = dst_row0 + dst_stride;
          float *dst_row2 = dst_row1 + dst_stride;
          float *dst_row3 = dst_row2 + dst_stride;
          // NOTE: this vector `dst` shadows the outer `float *dst` pointer.
          svfloat32_t dst =
              index_and_lerp1d(pg_1, indices[0], indices[1], coeffs_ab, svsrc);
          svst1(pg_1, dst_row0, dst);
          svst1(pg_1, dst_row1, dst);
          svst1(pg_1, dst_row2, dst);
          svst1(pg_1, dst_row3, dst);

          dst =
              index_and_lerp1d(pg_2, indices[2], indices[3], coeffs_ab, svsrc);
          svst1_vnum(pg_2, dst_row0, 1, dst);
          svst1_vnum(pg_2, dst_row1, 1, dst);
          svst1_vnum(pg_2, dst_row2, 1, dst);
          svst1_vnum(pg_2, dst_row3, 1, dst);

          dst =
              index_and_lerp1d(pg_3, indices[4], indices[5], coeffs_ab, svsrc);
          svst1_vnum(pg_3, dst_row0, 2, dst);
          svst1_vnum(pg_3, dst_row1, 2, dst);
          svst1_vnum(pg_3, dst_row2, 2, dst);
          svst1_vnum(pg_3, dst_row3, 2, dst);

          dst =
              index_and_lerp1d(pg_4, indices[6], indices[7], coeffs_ab, svsrc);
          svst1_vnum(pg_4, dst_row0, 3, dst);
          svst1_vnum(pg_4, dst_row1, 3, dst);
          svst1_vnum(pg_4, dst_row2, 3, dst);
          svst1_vnum(pg_4, dst_row3, 3, dst);
        }

        // Right elements
        // Replicate the rightmost source pixel into a 4x4 corner block.
        dst = dst_row + dst_width - 4;
        float right = src_row[src_width - 1];
        for (size_t i = 0; i < 4; ++i) {
          *dst++ = right;
          *dst++ = right;
          *dst++ = right;
          *dst = right;
          dst += dst_stride - 3;
        }
      };

  // 2-D weights for output row 0 between two source rows: vertical weights
  // 15/16 (upper row) and 1/16 (lower row), multiplied into the horizontal
  // a/b weights.  Row 7 reuses the same set with the source rows swapped.
  svfloat32_t coeffs_p = svmul_n_f32_x(svptrue_b32(), coeffs_a, 15.0 / 16);
  svfloat32_t coeffs_q = svmul_n_f32_x(svptrue_b32(), coeffs_b, 15.0 / 16);
  svfloat32_t coeffs_r = svmul_n_f32_x(svptrue_b32(), coeffs_a, 1.0 / 16);
  svfloat32_t coeffs_s = svmul_n_f32_x(svptrue_b32(), coeffs_b, 1.0 / 16);
  std::reference_wrapper<svfloat32_t> coeffs_pqrs[4] = {coeffs_p, coeffs_q,
                                                        coeffs_r, coeffs_s};

  // Full 2-D lerp: p*near0 + q*far0 + r*near1 + s*far1, where near/far are
  // gathered from the two source rows through the index tables.
  auto index_and_lerp2d = [](svbool_t pg, svuint32_t indices_a,
                             svuint32_t indices_b,
                             std::reference_wrapper<svfloat32_t> coeffs_pqrs[4],
                             svfloat32_t src0,
                             svfloat32_t src1) KLEIDICV_STREAMING {
    return svmla_f32_x(
        pg,
        svmla_f32_x(
            pg,
            svmla_f32_x(pg,
                        svmul_f32_x(pg, svtbl(src0, indices_a), coeffs_pqrs[0]),
                        svtbl(src0, indices_b), coeffs_pqrs[1]),
            svtbl(src1, indices_a), coeffs_pqrs[2]),
        svtbl(src1, indices_b), coeffs_pqrs[3]);
  };

  // Writes the 8 destination rows centred between two adjacent source rows.
  // Rows 0 and 7 are full 2-D lerps (row 7 swaps src_0/src_1 to mirror the
  // vertical weights); rows 1..6 are then vertical lerps between rows 0 and 7
  // with weights k/7, which reproduces the intermediate 2-D coefficients.
  auto process_row = [src_width, dst_width, index_and_lerp2d, lerp1d_vector](
                         const float *src_row0, const float *src_row1,
                         float *dst_row, size_t dst_stride,
                         std::reference_wrapper<svuint32_t> indices[8],
                         std::reference_wrapper<svfloat32_t>
                             coeffs_pqrs[4]) KLEIDICV_STREAMING {
    // Left edge
    // The first 4 columns replicate the left pixel horizontally, so only
    // vertical interpolation (weights (15-2i)/16 vs (2i+1)/16) is needed.
    svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read 1 element
    svbool_t pg4 = svptrue_pat_b32(SV_VL4);  // write 4 elements
    float *dst_lr = dst_row;
    svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
    svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
    for (size_t i = 0; i < 8; ++i) {
      svst1(pg4, dst_lr,
            lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
                          static_cast<float>(i * 2 + 1) / 16.0F, s1l));
      dst_lr += dst_stride;
    }

    // Middle elements
    for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
      size_t dst_x = src_x * 8 + 4;

      svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
      svfloat32_t src_0 = svld1_f32(pg, src_row0 + src_x);
      svfloat32_t src_1 = svld1_f32(pg, src_row1 + src_x);

      // Remaining valid middle output lanes; pg_1..pg_4 mask the four
      // consecutive output vectors against this limit.
      size_t dst_length = 8 * (src_width - src_x - 1);
      svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
      svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
      svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
      svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);

      float *dst_row0 = dst_row + dst_x;
      float *dst_row1 = dst_row0 + dst_stride;
      float *dst_row2 = dst_row1 + dst_stride;
      float *dst_row3 = dst_row2 + dst_stride;
      float *dst_row4 = dst_row3 + dst_stride;
      float *dst_row5 = dst_row4 + dst_stride;
      float *dst_row6 = dst_row5 + dst_stride;
      float *dst_row7 = dst_row6 + dst_stride;

      // First output vector (vnum 0): rows 0 and 7 via 2-D lerp, rows 1..6
      // blended between them.
      svfloat32_t dst_0 = index_and_lerp2d(pg_1, indices[0], indices[1],
                                           coeffs_pqrs, src_0, src_1);
      svst1(pg_1, dst_row0, dst_0);
      svfloat32_t dst_7 = index_and_lerp2d(pg_1, indices[0], indices[1],
                                           coeffs_pqrs, src_1, src_0);
      svst1(pg_1, dst_row7, dst_7);
      svst1(pg_1, dst_row1,
            lerp1d_vector(pg_1, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
      svst1(pg_1, dst_row2,
            lerp1d_vector(pg_1, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
      svst1(pg_1, dst_row3,
            lerp1d_vector(pg_1, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
      svst1(pg_1, dst_row4,
            lerp1d_vector(pg_1, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
      svst1(pg_1, dst_row5,
            lerp1d_vector(pg_1, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
      svst1(pg_1, dst_row6,
            lerp1d_vector(pg_1, 1.0 / 7, dst_0, 6.0 / 7, dst_7));

      // Second output vector (vnum 1).
      dst_0 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_0,
                               src_1);
      svst1_vnum(pg_2, dst_row0, 1, dst_0);
      dst_7 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_1,
                               src_0);
      svst1_vnum(pg_2, dst_row7, 1, dst_7);
      svst1_vnum(pg_2, dst_row1, 1,
                 lerp1d_vector(pg_2, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
      svst1_vnum(pg_2, dst_row2, 1,
                 lerp1d_vector(pg_2, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
      svst1_vnum(pg_2, dst_row3, 1,
                 lerp1d_vector(pg_2, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
      svst1_vnum(pg_2, dst_row4, 1,
                 lerp1d_vector(pg_2, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
      svst1_vnum(pg_2, dst_row5, 1,
                 lerp1d_vector(pg_2, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
      svst1_vnum(pg_2, dst_row6, 1,
                 lerp1d_vector(pg_2, 1.0 / 7, dst_0, 6.0 / 7, dst_7));

      // Third output vector (vnum 2).
      dst_0 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_0,
                               src_1);
      svst1_vnum(pg_3, dst_row0, 2, dst_0);
      dst_7 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_1,
                               src_0);
      svst1_vnum(pg_3, dst_row7, 2, dst_7);
      svst1_vnum(pg_3, dst_row1, 2,
                 lerp1d_vector(pg_3, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
      svst1_vnum(pg_3, dst_row2, 2,
                 lerp1d_vector(pg_3, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
      svst1_vnum(pg_3, dst_row3, 2,
                 lerp1d_vector(pg_3, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
      svst1_vnum(pg_3, dst_row4, 2,
                 lerp1d_vector(pg_3, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
      svst1_vnum(pg_3, dst_row5, 2,
                 lerp1d_vector(pg_3, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
      svst1_vnum(pg_3, dst_row6, 2,
                 lerp1d_vector(pg_3, 1.0 / 7, dst_0, 6.0 / 7, dst_7));

      // Fourth output vector (vnum 3).
      dst_0 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_0,
                               src_1);
      svst1_vnum(pg_4, dst_row0, 3, dst_0);
      dst_7 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_1,
                               src_0);
      svst1_vnum(pg_4, dst_row7, 3, dst_7);
      svst1_vnum(pg_4, dst_row1, 3,
                 lerp1d_vector(pg_4, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
      svst1_vnum(pg_4, dst_row2, 3,
                 lerp1d_vector(pg_4, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
      svst1_vnum(pg_4, dst_row3, 3,
                 lerp1d_vector(pg_4, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
      svst1_vnum(pg_4, dst_row4, 3,
                 lerp1d_vector(pg_4, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
      svst1_vnum(pg_4, dst_row5, 3,
                 lerp1d_vector(pg_4, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
      svst1_vnum(pg_4, dst_row6, 3,
                 lerp1d_vector(pg_4, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    }

    // Right edge
    // Mirror of the left edge, using the rightmost source pixels.
    dst_lr = dst_row;
    svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
    svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
    for (size_t i = 0; i < 8; ++i) {
      svst1(pg4, dst_lr + dst_width - 4,
            lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
                          static_cast<float>(i * 2 + 1) / 16.0F, s1r));
      dst_lr += dst_stride;
    }
  };

  // Top rows
  // Only the stripe containing the image top writes the top border.
  if (KLEIDICV_LIKELY(y_begin == 0)) {
    process_edge_row(src, dst, dst_stride, indices, coeffs_ab);
  }

  // Middle rows
  // Each pair of adjacent source rows produces 8 destination rows starting
  // at dst row src_y * 8 + 4.
  for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    size_t dst_y = src_y * 8 + 4;
    const float *src_row0 = src + src_stride * src_y;
    const float *src_row1 = src_row0 + src_stride;
    process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
                indices, coeffs_pqrs);
  }

  // Bottom rows
  // Only the stripe containing the image bottom writes the bottom border.
  if (KLEIDICV_LIKELY(y_end == src_height)) {
    process_edge_row(src + src_stride * (src_height - 1),
                     dst + dst_stride * (dst_height - 4), dst_stride, indices,
                     coeffs_ab);
  }

  return KLEIDICV_OK;
}
1143
1144 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t
1145 784 kleidicv_resize_linear_stripe_f32_sc(const float *src, size_t src_stride,
1146 size_t src_width, size_t src_height,
1147 size_t y_begin, size_t y_end, float *dst,
1148 size_t dst_stride, size_t dst_width,
1149 size_t dst_height) KLEIDICV_STREAMING {
1150
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 780 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 780 times.
784 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
1151
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 776 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 776 times.
780 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
1152
1153
2/4
✓ Branch 0 taken 776 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 776 times.
776 if (src_width == 0 || src_height == 0) {
1154 return KLEIDICV_OK;
1155 }
1156
3/4
✓ Branch 0 taken 296 times.
✓ Branch 1 taken 480 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 296 times.
776 if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
1157 592 return resize_2x2_f32_sc(src, src_stride, src_width, src_height, y_begin,
1158 296 y_end, dst, dst_stride);
1159 }
1160
3/4
✓ Branch 0 taken 248 times.
✓ Branch 1 taken 232 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 248 times.
480 if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
1161 496 return resize_4x4_f32_sc(src, src_stride, src_width, src_height, y_begin,
1162 248 y_end, dst, dst_stride);
1163 }
1164
2/4
✓ Branch 0 taken 232 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 232 times.
232 if (src_width * 8 == dst_width && src_height * 8 == dst_height) {
1165
2/2
✓ Branch 0 taken 174 times.
✓ Branch 1 taken 58 times.
232 if (svcntw() >= 8) {
1166 348 return resize_8x8_f32_sve256plus_sc(src, src_stride, src_width,
1167 174 src_height, y_begin, y_end, dst,
1168 174 dst_stride);
1169 }
1170 116 return resize_8x8_f32_sve128_sc(src, src_stride, src_width, src_height,
1171 58 y_begin, y_end, dst, dst_stride);
1172 }
1173 // resize_linear_f32_is_implemented checked the kernel size already.
1174 // GCOVR_EXCL_START
1175 assert(!"resize ratio not implemented");
1176 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1177 // GCOVR_EXCL_STOP
1178 784 }
1179
1180 } // namespace KLEIDICV_TARGET_NAMESPACE
1181
#endif  // KLEIDICV_RESIZE_LINEAR_SC_H
1183