Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cassert> | ||
6 | |||
7 | #include "kleidicv/kleidicv.h" | ||
8 | #include "kleidicv/neon.h" | ||
9 | #include "kleidicv/operations.h" | ||
10 | #include "kleidicv/resize/resize_linear.h" | ||
11 | |||
12 | namespace kleidicv::neon { | ||
13 | |||
14 | template <uint8_t P, uint8_t Q, uint8_t Bias, uint8_t Shift> | ||
15 | 8848 | uint8x8_t lerp2d_vector_p_q_q_1(uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
16 | uint8x8_t d) { | ||
17 | // b + c | ||
18 | 8848 | uint16x8_t b_c = vaddl_u8(b, c); | |
19 | |||
20 | // a * p | ||
21 | 8848 | uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); | |
22 | |||
23 | // a * p + (b + c) * q | ||
24 | 8848 | uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); | |
25 | |||
26 | // d + bias | ||
27 | 8848 | uint16x8_t d_bias = vaddl_u8(d, vdup_n_u8(Bias)); | |
28 | |||
29 | // a * p + (b + c) * q + d + bias | ||
30 | 8848 | uint16x8_t ap_bcq_d_bias = vaddq_u16(ap_bcq, d_bias); | |
31 | |||
32 | // (a * p + (b + c) * q + d + bias) >> shift | ||
33 | 8848 | uint8x8_t result = vshrn_n_u16(ap_bcq_d_bias, Shift); | |
34 | 17696 | return result; | |
35 | 8848 | } | |
36 | |||
37 | template <uint8_t P, uint8_t Q, uint8_t R, uint8_t Bias, uint8_t Shift> | ||
38 | 4424 | uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
39 | uint8x8_t d) { | ||
40 | // b + c | ||
41 | 4424 | uint16x8_t b_c = vaddl_u8(b, c); | |
42 | |||
43 | // a * p | ||
44 | 4424 | uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); | |
45 | |||
46 | // d * r | ||
47 | 4424 | uint16x8_t dr = vmull_u8(d, vdup_n_u8(R)); | |
48 | |||
49 | // a * p + (b + c) * q | ||
50 | 4424 | uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); | |
51 | |||
52 | // d * r + bias | ||
53 | 4424 | uint16x8_t dr_bias = vaddq_u16(dr, vdupq_n_u16(Bias)); | |
54 | |||
55 | // a * p + (b + c) * q + d * r + bias | ||
56 | 4424 | uint16x8_t ap_bcq_dr_bias = vaddq_u16(ap_bcq, dr_bias); | |
57 | |||
58 | // (a * p + (b + c) * q + d * r + bias) >> shift | ||
59 | 4424 | uint8x8_t result = vshrn_n_u16(ap_bcq_dr_bias, Shift); | |
60 | 8848 | return result; | |
61 | 4424 | } | |
62 | |||
63 | 54 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8( | |
64 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
65 | size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { | ||
66 | 54 | size_t dst_width = src_width * 2; | |
67 | |||
68 | 926 | auto lerp1d_scalar = [](uint8_t near, uint8_t far) { | |
69 | 872 | return (near * 3 + far + 2) >> 2; | |
70 | }; | ||
71 | |||
72 | 1150 | auto lerp1d_vector = [](uint8x8_t near, uint8x8_t far) { | |
73 | 1096 | uint8x8_t three = vdup_n_u8(3); | |
74 | 1096 | uint8x8_t two = vdup_n_u8(2); | |
75 | |||
76 | // near * 3 | ||
77 | 1096 | uint16x8_t near3 = vmull_u8(near, three); | |
78 | |||
79 | // far + 2 | ||
80 | 1096 | uint16x8_t far_2 = vaddl_u8(far, two); | |
81 | |||
82 | // near * 3 + far + 2 | ||
83 | 1096 | uint16x8_t near3_far_2 = vaddq_u16(near3, far_2); | |
84 | |||
85 | // (near * 3 + far + 2) / 4 | ||
86 | 1096 | uint8x8_t near3_far_2_div4 = vshrn_n_u16(near3_far_2, 2); | |
87 | |||
88 | 2192 | return near3_far_2_div4; | |
89 | 1096 | }; | |
90 | |||
91 | 1382 | auto lerp2d_scalar = [](uint8_t near, uint8_t mid_a, uint8_t mid_b, | |
92 | uint8_t far) { | ||
93 | 1328 | return (near * 9 + (mid_a + mid_b) * 3 + far + 8) >> 4; | |
94 | }; | ||
95 | |||
96 | 4478 | auto lerp2d_vector = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) { | |
97 | 4424 | return lerp2d_vector_p_q_q_1<9, 3, 8, 4>(a, b, c, d); | |
98 | }; | ||
99 | |||
100 | // Handle top or bottom edge | ||
101 | 160 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
102 | const uint8_t *src_row, uint8_t *dst_row) { | ||
103 | // Left element | ||
104 | 106 | dst_row[0] = src_row[0]; | |
105 | |||
106 | // Right element | ||
107 | 106 | dst_row[dst_width - 1] = src_row[src_width - 1]; | |
108 | |||
109 | // Middle elements | ||
110 | 106 | size_t src_x = 0; | |
111 |
2/2✓ Branch 0 taken 548 times.
✓ Branch 1 taken 106 times.
|
654 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
112 | 548 | size_t dst_x = src_x * 2 + 1; | |
113 | 548 | uint8x8_t src_left = vld1_u8(src_row + src_x); | |
114 | 548 | uint8x8_t src_right = vld1_u8(src_row + src_x + 1); | |
115 | |||
116 | 548 | uint8x8_t dst_left = lerp1d_vector(src_left, src_right); | |
117 | 548 | uint8x8_t dst_right = lerp1d_vector(src_right, src_left); | |
118 | |||
119 | 548 | vst2_u8(dst_row + dst_x, (uint8x8x2_t{dst_left, dst_right})); | |
120 | 548 | } | |
121 |
2/2✓ Branch 0 taken 140 times.
✓ Branch 1 taken 106 times.
|
246 | for (; src_x + 1 < src_width; ++src_x) { |
122 | 140 | size_t dst_x = src_x * 2 + 1; | |
123 | 140 | const uint8_t src_left = src_row[src_x], src_right = src_row[src_x + 1]; | |
124 | 140 | dst_row[dst_x] = lerp1d_scalar(src_left, src_right); | |
125 | 140 | dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left); | |
126 | 140 | } | |
127 | 106 | }; | |
128 | |||
129 | 202 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
130 | lerp2d_vector](const uint8_t *src_row0, | ||
131 | const uint8_t *src_row1, uint8_t *dst_row0, | ||
132 | uint8_t *dst_row1) { | ||
133 | // Left element | ||
134 | 148 | dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); | |
135 | 148 | dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); | |
136 | |||
137 | // Right element | ||
138 | 148 | dst_row0[dst_width - 1] = | |
139 | 148 | lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); | |
140 | 148 | dst_row1[dst_width - 1] = | |
141 | 148 | lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); | |
142 | |||
143 | // Middle elements | ||
144 | 148 | size_t src_x = 0; | |
145 |
2/2✓ Branch 0 taken 1106 times.
✓ Branch 1 taken 148 times.
|
1254 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
146 | 1106 | size_t dst_x = src_x * 2 + 1; | |
147 | |||
148 | 1106 | uint8x8_t src_tl = vld1_u8(src_row0 + src_x); | |
149 | 1106 | uint8x8_t src_tr = vld1_u8(src_row0 + src_x + 1); | |
150 | 1106 | uint8x8_t src_bl = vld1_u8(src_row1 + src_x); | |
151 | 1106 | uint8x8_t src_br = vld1_u8(src_row1 + src_x + 1); | |
152 | |||
153 | 1106 | uint8x8_t dst_tl = lerp2d_vector(src_tl, src_tr, src_bl, src_br); | |
154 | 1106 | uint8x8_t dst_tr = lerp2d_vector(src_tr, src_tl, src_br, src_bl); | |
155 | 1106 | uint8x8_t dst_bl = lerp2d_vector(src_bl, src_tl, src_br, src_tr); | |
156 | 1106 | uint8x8_t dst_br = lerp2d_vector(src_br, src_tr, src_bl, src_tl); | |
157 | |||
158 | 1106 | vst2_u8(dst_row0 + dst_x, (uint8x8x2_t{dst_tl, dst_tr})); | |
159 | 1106 | vst2_u8(dst_row1 + dst_x, (uint8x8x2_t{dst_bl, dst_br})); | |
160 | 1106 | } | |
161 |
2/2✓ Branch 0 taken 332 times.
✓ Branch 1 taken 148 times.
|
480 | for (; src_x + 1 < src_width; ++src_x) { |
162 | 332 | size_t dst_x = src_x * 2 + 1; | |
163 | 664 | const uint8_t src_tl = src_row0[src_x], src_tr = src_row0[src_x + 1], | |
164 | 664 | src_bl = src_row1[src_x], src_br = src_row1[src_x + 1]; | |
165 | 332 | dst_row0[dst_x] = lerp2d_scalar(src_tl, src_tr, src_bl, src_br); | |
166 | 332 | dst_row0[dst_x + 1] = lerp2d_scalar(src_tr, src_tl, src_br, src_bl); | |
167 | 332 | dst_row1[dst_x] = lerp2d_scalar(src_bl, src_tl, src_br, src_tr); | |
168 | 332 | dst_row1[dst_x + 1] = lerp2d_scalar(src_br, src_tr, src_bl, src_tl); | |
169 | 332 | } | |
170 | 148 | }; | |
171 | |||
172 | // Top row | ||
173 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 53 times.
|
54 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
174 | 53 | process_edge_row(src, dst); | |
175 | 53 | } | |
176 | |||
177 | // Middle rows | ||
178 |
2/2✓ Branch 0 taken 148 times.
✓ Branch 1 taken 54 times.
|
202 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
179 | 148 | size_t dst_y = src_y * 2 + 1; | |
180 | 148 | const uint8_t *src_row0 = src + src_stride * src_y; | |
181 | 148 | const uint8_t *src_row1 = src_row0 + src_stride; | |
182 | 148 | uint8_t *dst_row0 = dst + dst_stride * dst_y; | |
183 | 148 | uint8_t *dst_row1 = dst_row0 + dst_stride; | |
184 | |||
185 | 148 | process_row(src_row0, src_row1, dst_row0, dst_row1); | |
186 | 148 | } | |
187 | |||
188 | // Bottom row | ||
189 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 53 times.
|
54 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
190 | 106 | process_edge_row(src + src_stride * (src_height - 1), | |
191 | 53 | dst + dst_stride * (src_height * 2 - 1)); | |
192 | 53 | } | |
193 | |||
194 | 54 | return KLEIDICV_OK; | |
195 | 54 | } | |
196 | |||
197 | 34 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8( | |
198 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
199 | size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { | ||
200 | 34 | size_t dst_width = src_width * 4, dst_height = src_height * 4; | |
201 | |||
202 | 1394 | auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, | |
203 | uint8_t b) { | ||
204 | 1360 | return (coeff_a * a + coeff_b * b + 4) >> 3; | |
205 | }; | ||
206 | 2226 | auto lerp1d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, | |
207 | uint8_t coeff_b_scalar, uint8x8_t b) { | ||
208 | 2192 | uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); | |
209 | 2192 | uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); | |
210 | 2192 | uint16x8_t four = vdupq_n_u16(4); | |
211 | |||
212 | // a * coeff_a | ||
213 | 2192 | uint16x8_t a1 = vmull_u8(a, coeff_a); | |
214 | |||
215 | // b * coeff_b | ||
216 | 2192 | uint16x8_t b1 = vmull_u8(b, coeff_b); | |
217 | |||
218 | // a * coeff_a + b * coeff_b | ||
219 | 2192 | uint16x8_t a1_b1 = vaddq_u16(a1, b1); | |
220 | |||
221 | // a * coeff_a + b * coeff_b + 4 | ||
222 | 2192 | uint16x8_t a1_b1_4 = vaddq_u16(a1_b1, four); | |
223 | |||
224 | // (a * coeff_a + b * coeff_b + 4) / 8 | ||
225 | 2192 | uint8x8_t result = vshrn_n_u16(a1_b1_4, 3); | |
226 | |||
227 | 4384 | return result; | |
228 | 2192 | }; | |
229 | 4514 | auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, | |
230 | uint8_t b, uint8_t coeff_c, uint8_t c, | ||
231 | uint8_t coeff_d, uint8_t d) { | ||
232 | 4480 | return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6; | |
233 | }; | ||
234 | 8882 | auto lerp2d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, | |
235 | uint8_t coeff_b_scalar, uint8x8_t b, | ||
236 | uint8_t coeff_c_scalar, uint8x8_t c, | ||
237 | uint8_t coeff_d_scalar, uint8x8_t d) { | ||
238 | 8848 | uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); | |
239 | 8848 | uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); | |
240 | 8848 | uint8x8_t coeff_c = vdup_n_u8(coeff_c_scalar); | |
241 | 8848 | uint8x8_t coeff_d = vdup_n_u8(coeff_d_scalar); | |
242 | 8848 | uint16x8_t thirtytwo = vdupq_n_u16(32); | |
243 | |||
244 | // a * coeff_a | ||
245 | 8848 | uint16x8_t a1 = vmull_u8(a, coeff_a); | |
246 | |||
247 | // b * coeff_b | ||
248 | 8848 | uint16x8_t b1 = vmull_u8(b, coeff_b); | |
249 | |||
250 | // c * coeff_c | ||
251 | 8848 | uint16x8_t c1 = vmull_u8(c, coeff_c); | |
252 | |||
253 | // d * coeff_d | ||
254 | 8848 | uint16x8_t d1 = vmull_u8(d, coeff_d); | |
255 | |||
256 | // a * coeff_a + b * coeff_b | ||
257 | 8848 | uint16x8_t a1_b1 = vaddq_u16(a1, b1); | |
258 | |||
259 | // c * coeff_c + d * coeff_d | ||
260 | 8848 | uint16x8_t c1_d1 = vaddq_u16(c1, d1); | |
261 | |||
262 | // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d | ||
263 | 8848 | uint16x8_t a1_b1_c1_d1 = vaddq_u16(a1_b1, c1_d1); | |
264 | |||
265 | // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32 | ||
266 | 8848 | uint16x8_t a1_b1_c1_d1_32 = vaddq_u16(a1_b1_c1_d1, thirtytwo); | |
267 | |||
268 | // (a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32) / 64 | ||
269 | 8848 | uint8x8_t result = vshrn_n_u16(a1_b1_c1_d1_32, 6); | |
270 | 17696 | return result; | |
271 | 8848 | }; | |
272 | // Handle top or bottom edge | ||
273 | 100 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
274 | const uint8_t *src_row, uint8_t *dst_row) { | ||
275 | // Left elements | ||
276 | 66 | dst_row[1] = dst_row[0] = src_row[0]; | |
277 | |||
278 | // Right elements | ||
279 | 66 | dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; | |
280 | |||
281 | // Middle elements | ||
282 | 66 | size_t src_x = 0; | |
283 |
2/2✓ Branch 0 taken 548 times.
✓ Branch 1 taken 66 times.
|
614 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
284 | 548 | size_t dst_x = src_x * 4 + 2; | |
285 | 548 | uint8x8_t a = vld1_u8(src_row + src_x); | |
286 | 548 | uint8x8_t b = vld1_u8(src_row + src_x + 1); | |
287 | 548 | uint8x8x4_t interpolated = { | |
288 | 1644 | lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b), | |
289 | 1096 | lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)}; | |
290 | |||
291 | 548 | vst4_u8(dst_row + dst_x, interpolated); | |
292 | 548 | } | |
293 |
2/2✓ Branch 0 taken 84 times.
✓ Branch 1 taken 66 times.
|
150 | for (; src_x + 1 < src_width; ++src_x) { |
294 | 84 | size_t dst_x = src_x * 4 + 2; | |
295 | 84 | const uint8_t a = src_row[src_x], b = src_row[src_x + 1]; | |
296 | 84 | dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b); | |
297 | 84 | dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b); | |
298 | 84 | dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b); | |
299 | 84 | dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b); | |
300 | 84 | } | |
301 | 66 | }; | |
302 | |||
303 | 162 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
304 | lerp2d_vector](const uint8_t *src_row0, | ||
305 | const uint8_t *src_row1, uint8_t *dst_row0, | ||
306 | uint8_t *dst_row1, uint8_t *dst_row2, | ||
307 | uint8_t *dst_row3) { | ||
308 | 4552 | auto lerp2d_vector_49_7_7_1 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
309 | uint8x8_t d) { | ||
310 | 4424 | return lerp2d_vector_p_q_q_1<49, 7, 32, 6>(a, b, c, d); | |
311 | }; | ||
312 | 4552 | auto lerp2d_vector_25_15_15_9 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
313 | uint8x8_t d) { | ||
314 | 4424 | return lerp2d_vector_p_q_q_r<25, 15, 9, 32, 6>(a, b, c, d); | |
315 | }; | ||
316 | |||
317 | // Left elements | ||
318 | 128 | const uint8_t s0l = src_row0[0], s1l = src_row1[0]; | |
319 | 128 | dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); | |
320 | 128 | dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); | |
321 | 128 | dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); | |
322 | 128 | dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); | |
323 | |||
324 | // Right elements | ||
325 | 128 | const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; | |
326 | 128 | const size_t dr0 = dst_width - 2; | |
327 | 128 | const size_t dr1 = dst_width - 1; | |
328 | 128 | dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); | |
329 | 128 | dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); | |
330 | 128 | dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); | |
331 | 128 | dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); | |
332 | |||
333 | // Middle elements | ||
334 | 128 | size_t src_x = 0; | |
335 |
2/2✓ Branch 0 taken 1106 times.
✓ Branch 1 taken 128 times.
|
1234 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
336 | 1106 | size_t dst_x = src_x * 4 + 2; | |
337 | |||
338 | 1106 | uint8x8_t a = vld1_u8(src_row0 + src_x); | |
339 | 1106 | uint8x8_t b = vld1_u8(src_row0 + src_x + 1); | |
340 | 1106 | uint8x8_t c = vld1_u8(src_row1 + src_x); | |
341 | 1106 | uint8x8_t d = vld1_u8(src_row1 + src_x + 1); | |
342 | |||
343 | 1106 | vst4_u8(dst_row0 + dst_x, (uint8x8x4_t{ | |
344 | lerp2d_vector_49_7_7_1(a, b, c, d), | ||
345 | lerp2d_vector(35, a, 21, b, 5, c, 3, d), | ||
346 | lerp2d_vector(21, a, 35, b, 3, c, 5, d), | ||
347 | lerp2d_vector_49_7_7_1(b, a, d, c), | ||
348 | })); | ||
349 | 1106 | vst4_u8(dst_row1 + dst_x, (uint8x8x4_t{ | |
350 | lerp2d_vector(35, a, 5, b, 21, c, 3, d), | ||
351 | lerp2d_vector_25_15_15_9(a, b, c, d), | ||
352 | lerp2d_vector_25_15_15_9(b, a, d, c), | ||
353 | lerp2d_vector(5, a, 35, b, 3, c, 21, d), | ||
354 | })); | ||
355 | 1106 | vst4_u8(dst_row2 + dst_x, (uint8x8x4_t{ | |
356 | lerp2d_vector(21, a, 3, b, 35, c, 5, d), | ||
357 | lerp2d_vector_25_15_15_9(c, a, d, b), | ||
358 | lerp2d_vector_25_15_15_9(d, b, c, a), | ||
359 | lerp2d_vector(3, a, 21, b, 5, c, 35, d), | ||
360 | })); | ||
361 | 1106 | vst4_u8(dst_row3 + dst_x, (uint8x8x4_t{ | |
362 | lerp2d_vector_49_7_7_1(c, a, d, b), | ||
363 | lerp2d_vector(5, a, 3, b, 35, c, 21, d), | ||
364 | lerp2d_vector(3, a, 5, b, 21, c, 35, d), | ||
365 | lerp2d_vector_49_7_7_1(d, b, c, a), | ||
366 | })); | ||
367 | 1106 | } | |
368 |
2/2✓ Branch 0 taken 280 times.
✓ Branch 1 taken 128 times.
|
408 | for (; src_x + 1 < src_width; ++src_x) { |
369 | 280 | size_t dst_x = src_x * 4 + 2; | |
370 | 560 | const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1], | |
371 | 560 | c = src_row1[src_x], d = src_row1[src_x + 1]; | |
372 | |||
373 | 280 | dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d); | |
374 | 280 | dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d); | |
375 | 280 | dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d); | |
376 | 280 | dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d); | |
377 | 280 | dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d); | |
378 | 280 | dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d); | |
379 | 280 | dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d); | |
380 | 280 | dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d); | |
381 | 280 | dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d); | |
382 | 280 | dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d); | |
383 | 280 | dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d); | |
384 | 280 | dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d); | |
385 | 280 | dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d); | |
386 | 280 | dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d); | |
387 | 280 | dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d); | |
388 | 280 | dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d); | |
389 | 280 | } | |
390 | 128 | }; | |
391 | |||
392 | // Top rows | ||
393 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
394 | 33 | process_edge_row(src, dst); | |
395 | 33 | memcpy(dst + dst_stride, dst, dst_stride); | |
396 | 33 | } | |
397 | |||
398 | // Middle rows | ||
399 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
|
162 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
400 | 128 | size_t dst_y = src_y * 4 + 2; | |
401 | 128 | const uint8_t *src_row0 = src + src_stride * src_y; | |
402 | 128 | const uint8_t *src_row1 = src_row0 + src_stride; | |
403 | 128 | uint8_t *dst_row0 = dst + dst_stride * dst_y; | |
404 | 128 | uint8_t *dst_row1 = dst_row0 + dst_stride; | |
405 | 128 | uint8_t *dst_row2 = dst_row1 + dst_stride; | |
406 | 128 | uint8_t *dst_row3 = dst_row2 + dst_stride; | |
407 | |||
408 | 128 | process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); | |
409 | 128 | } | |
410 | |||
411 | // Bottom rows | ||
412 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
413 | 66 | process_edge_row(src + src_stride * (src_height - 1), | |
414 | 33 | dst + dst_stride * (dst_height - 2)); | |
415 | 99 | memcpy(dst + dst_stride * (dst_height - 1), | |
416 | 66 | dst + dst_stride * (dst_height - 2), dst_stride); | |
417 | 33 | } | |
418 | |||
419 | 34 | return KLEIDICV_OK; | |
420 | 34 | } | |
421 | |||
422 | KLEIDICV_TARGET_FN_ATTRS | ||
423 | 95 | kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, | |
424 | size_t src_width, size_t src_height, | ||
425 | size_t y_begin, size_t y_end, | ||
426 | uint8_t *dst, size_t dst_stride, | ||
427 | size_t dst_width, size_t dst_height) { | ||
428 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 94 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 94 times.
|
95 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
429 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 93 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 93 times.
|
94 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
430 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 92 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 91 times.
|
93 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
431 | |||
432 |
4/4✓ Branch 0 taken 89 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 88 times.
|
91 | if (src_width == 0 || src_height == 0) { |
433 | 3 | return KLEIDICV_OK; | |
434 | } | ||
435 |
3/4✓ Branch 0 taken 54 times.
✓ Branch 1 taken 34 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
|
88 | if (src_width * 2 == dst_width && src_height * 2 == dst_height) { |
436 | 108 | return resize_2x2_u8(src, src_stride, src_width, src_height, y_begin, y_end, | |
437 | 54 | dst, dst_stride); | |
438 | } | ||
439 |
2/4✓ Branch 0 taken 34 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 34 times.
|
34 | if (src_width * 4 == dst_width && src_height * 4 == dst_height) { |
440 | 68 | return resize_4x4_u8(src, src_stride, src_width, src_height, y_begin, y_end, | |
441 | 34 | dst, dst_stride); | |
442 | } | ||
443 | // resize_linear_u8_is_implemented checked the kernel size already. | ||
444 | // GCOVR_EXCL_START | ||
445 | assert(!"resize ratio not implemented"); | ||
446 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
447 | // GCOVR_EXCL_STOP | ||
448 | 95 | } | |
449 | |||
450 | 50 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32( | |
451 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
452 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
453 | 50 | size_t dst_width = src_width * 2; | |
454 | 50 | src_stride /= sizeof(float); | |
455 | 50 | dst_stride /= sizeof(float); | |
456 | |||
457 | 874 | auto lerp1d_scalar = [](float near, float far) { | |
458 | 824 | return near * 0.75F + far * 0.25F; | |
459 | }; | ||
460 | |||
461 | 2250 | auto lerp1d_vector = [](float32x4_t near, float32x4_t far) { | |
462 | 2200 | return vmlaq_n_f32(vmulq_n_f32(near, 0.75F), far, 0.25F); | |
463 | }; | ||
464 | |||
465 | 1282 | auto lerp2d_scalar = [](float near, float mid_a, float mid_b, float far) { | |
466 | 1232 | return near * 0.5625F + mid_a * 0.1875F + mid_b * 0.1875F + far * 0.0625F; | |
467 | }; | ||
468 | |||
469 | 8922 | auto lerp2d_vector = [](float32x4_t a, float32x4_t b, float32x4_t c, | |
470 | float32x4_t d) { | ||
471 | 8872 | return vmlaq_n_f32( | |
472 | 8872 | vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, 0.5625F), b, 0.1875F), c, | |
473 | 0.1875F), | ||
474 | 8872 | d, 0.0625F); | |
475 | }; | ||
476 | |||
477 | // Handle top or bottom edge | ||
478 | 148 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
479 | const float *src_row, float *dst_row) { | ||
480 | // Left element | ||
481 | 98 | dst_row[0] = src_row[0]; | |
482 | |||
483 | // Right element | ||
484 | 98 | dst_row[dst_width - 1] = src_row[src_width - 1]; | |
485 | |||
486 | // Middle elements | ||
487 | 98 | size_t src_x = 0; | |
488 |
2/2✓ Branch 0 taken 1100 times.
✓ Branch 1 taken 98 times.
|
1198 | for (; src_x + 4 < src_width; src_x += 4) { |
489 | 1100 | size_t dst_x = src_x * 2 + 1; | |
490 | 1100 | float32x4_t src_left = vld1q_f32(src_row + src_x); | |
491 | 1100 | float32x4_t src_right = vld1q_f32(src_row + src_x + 1); | |
492 | |||
493 | 1100 | float32x4_t dst_left = lerp1d_vector(src_left, src_right); | |
494 | 1100 | float32x4_t dst_right = lerp1d_vector(src_right, src_left); | |
495 | |||
496 | 1100 | vst2q_f32(dst_row + dst_x, (float32x4x2_t{dst_left, dst_right})); | |
497 | 1100 | } | |
498 |
2/2✓ Branch 0 taken 116 times.
✓ Branch 1 taken 98 times.
|
214 | for (; src_x + 1 < src_width; ++src_x) { |
499 | 116 | size_t dst_x = src_x * 2 + 1; | |
500 | 116 | const float src_left = src_row[src_x], src_right = src_row[src_x + 1]; | |
501 | 116 | dst_row[dst_x] = lerp1d_scalar(src_left, src_right); | |
502 | 116 | dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left); | |
503 | 116 | } | |
504 | 98 | }; | |
505 | |||
506 | 198 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
507 | lerp2d_vector](const float *src_row0, | ||
508 | const float *src_row1, float *dst_row0, | ||
509 | float *dst_row1) { | ||
510 | // Left element | ||
511 | 148 | dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); | |
512 | 148 | dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); | |
513 | |||
514 | // Right element | ||
515 | 148 | dst_row0[dst_width - 1] = | |
516 | 148 | lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); | |
517 | 148 | dst_row1[dst_width - 1] = | |
518 | 148 | lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); | |
519 | |||
520 | // Middle elements | ||
521 | 148 | size_t src_x = 0; | |
522 |
2/2✓ Branch 0 taken 2218 times.
✓ Branch 1 taken 148 times.
|
2366 | for (; src_x + 4 < src_width; src_x += 4) { |
523 | 2218 | size_t dst_x = src_x * 2 + 1; | |
524 | |||
525 | 2218 | float32x4_t a = vld1q_f32(src_row0 + src_x); | |
526 | 2218 | float32x4_t b = vld1q_f32(src_row0 + src_x + 1); | |
527 | 2218 | float32x4_t c = vld1q_f32(src_row1 + src_x); | |
528 | 2218 | float32x4_t d = vld1q_f32(src_row1 + src_x + 1); | |
529 | |||
530 | 2218 | vst2q_f32(dst_row0 + dst_x, (float32x4x2_t{lerp2d_vector(a, b, c, d), | |
531 | lerp2d_vector(b, a, d, c)})); | ||
532 | 2218 | vst2q_f32(dst_row1 + dst_x, (float32x4x2_t{lerp2d_vector(c, a, d, b), | |
533 | lerp2d_vector(d, b, c, a)})); | ||
534 | 2218 | } | |
535 |
2/2✓ Branch 0 taken 308 times.
✓ Branch 1 taken 148 times.
|
456 | for (; src_x + 1 < src_width; ++src_x) { |
536 | 308 | size_t dst_x = src_x * 2 + 1; | |
537 | 616 | const float a = src_row0[src_x], b = src_row0[src_x + 1], | |
538 | 616 | c = src_row1[src_x], d = src_row1[src_x + 1]; | |
539 | 308 | dst_row0[dst_x] = lerp2d_scalar(a, b, c, d); | |
540 | 308 | dst_row0[dst_x + 1] = lerp2d_scalar(b, a, d, c); | |
541 | 308 | dst_row1[dst_x] = lerp2d_scalar(c, a, d, b); | |
542 | 308 | dst_row1[dst_x + 1] = lerp2d_scalar(d, b, c, a); | |
543 | 308 | } | |
544 | 148 | }; | |
545 | |||
546 | // Top row | ||
547 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
|
50 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
548 | 49 | process_edge_row(src, dst); | |
549 | 49 | } | |
550 | |||
551 | // Middle rows | ||
552 |
2/2✓ Branch 0 taken 148 times.
✓ Branch 1 taken 50 times.
|
198 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
553 | 148 | size_t dst_y = src_y * 2 + 1; | |
554 | 148 | const float *src_row0 = src + src_stride * src_y; | |
555 | 148 | const float *src_row1 = src_row0 + src_stride; | |
556 | 148 | float *dst_row0 = dst + dst_stride * dst_y; | |
557 | 148 | float *dst_row1 = dst_row0 + dst_stride; | |
558 | |||
559 | 148 | process_row(src_row0, src_row1, dst_row0, dst_row1); | |
560 | 148 | } | |
561 | |||
562 | // Bottom row | ||
563 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
|
50 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
564 | 98 | process_edge_row(src + src_stride * (src_height - 1), | |
565 | 49 | dst + dst_stride * (src_height * 2 - 1)); | |
566 | 49 | } | |
567 | |||
568 | 50 | return KLEIDICV_OK; | |
569 | 50 | } | |
570 | |||
571 | 38 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( | |
572 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
573 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
574 | using T = float; | ||
575 | 38 | size_t dst_height = src_height * 4; | |
576 | 38 | size_t dst_width = src_width * 4; | |
577 | 38 | src_stride /= sizeof(T); | |
578 | 38 | dst_stride /= sizeof(T); | |
579 | |||
580 | 1398 | auto lerp1d_scalar = [](T coeff_a, T a, T coeff_b, T b) { | |
581 | 1360 | return coeff_a * a + coeff_b * b; | |
582 | }; | ||
583 | 22182 | auto lerp1d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b) { | |
584 | 22144 | return vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b); | |
585 | }; | ||
586 | 4198 | auto lerp2d_scalar = [](T coeff_a, T a, T coeff_b, T b, T coeff_c, T c, | |
587 | T coeff_d, T d) { | ||
588 | 4160 | return coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d; | |
589 | }; | ||
590 | 17782 | auto lerp2d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b, | |
591 | T coeff_c, float32x4_t c, T coeff_d, float32x4_t d) { | ||
592 | 17744 | return vmlaq_n_f32( | |
593 | 35488 | vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b), c, | |
594 | 17744 | coeff_c), | |
595 | 17744 | d, coeff_d); | |
596 | }; | ||
597 | // Handle top or bottom edge | ||
598 | 112 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
599 | const T *src_row, T *dst_row) { | ||
600 | // Left elements | ||
601 | 74 | dst_row[1] = dst_row[0] = src_row[0]; | |
602 | |||
603 | // Right elements | ||
604 | 74 | dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; | |
605 | |||
606 | // Middle elements | ||
607 | 74 | size_t src_x = 0; | |
608 |
2/2✓ Branch 0 taken 1100 times.
✓ Branch 1 taken 74 times.
|
1174 | for (; src_x + 4 < src_width; src_x += 4) { |
609 | 1100 | size_t dst_x = src_x * 4 + 2; | |
610 | 1100 | float32x4_t a = vld1q_f32(src_row + src_x); | |
611 | 1100 | float32x4_t b = vld1q_f32(src_row + src_x + 1); | |
612 | 1100 | vst4q_f32(dst_row + dst_x, | |
613 | (float32x4x4_t{lerp1d_vector(0.875F, a, 0.125F, b), | ||
614 | lerp1d_vector(0.625F, a, 0.375F, b), | ||
615 | lerp1d_vector(0.375F, a, 0.625F, b), | ||
616 | lerp1d_vector(0.125F, a, 0.875F, b)})); | ||
617 | 1100 | } | |
618 |
2/2✓ Branch 0 taken 76 times.
✓ Branch 1 taken 74 times.
|
150 | for (; src_x + 1 < src_width; ++src_x) { |
619 | 76 | size_t dst_x = src_x * 4 + 2; | |
620 | 76 | const T a = src_row[src_x], b = src_row[src_x + 1]; | |
621 | 76 | dst_row[dst_x + 0] = lerp1d_scalar(0.875F, a, 0.125F, b); | |
622 | 76 | dst_row[dst_x + 1] = lerp1d_scalar(0.625F, a, 0.375F, b); | |
623 | 76 | dst_row[dst_x + 2] = lerp1d_scalar(0.375F, a, 0.625F, b); | |
624 | 76 | dst_row[dst_x + 3] = lerp1d_scalar(0.125F, a, 0.875F, b); | |
625 | 76 | } | |
626 | 74 | }; | |
627 | |||
628 | 170 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector, | |
629 | lerp2d_scalar, lerp2d_vector]( | ||
630 | const T *src_row0, const T *src_row1, T *dst_row0, | ||
631 | T *dst_row1, T *dst_row2, T *dst_row3) { | ||
632 | // Left elements | ||
633 | 132 | const T s0l = src_row0[0], s1l = src_row1[0]; | |
634 | 132 | dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l); | |
635 | 132 | dst_row1[0] = dst_row1[1] = lerp1d_scalar(0.625F, s0l, 0.375F, s1l); | |
636 | 132 | dst_row2[0] = dst_row2[1] = lerp1d_scalar(0.375F, s0l, 0.625F, s1l); | |
637 | 132 | dst_row3[0] = dst_row3[1] = lerp1d_scalar(0.125F, s0l, 0.875F, s1l); | |
638 | |||
639 | // Right elements | ||
640 | 132 | const T s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; | |
641 | 132 | const size_t dr0 = dst_width - 2; | |
642 | 132 | const size_t dr1 = dst_width - 1; | |
643 | 132 | dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(0.875F, s0r, 0.125F, s1r); | |
644 | 132 | dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(0.625F, s0r, 0.375F, s1r); | |
645 | 132 | dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(0.375F, s0r, 0.625F, s1r); | |
646 | 132 | dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r); | |
647 | |||
648 | // Middle elements | ||
649 | 132 | size_t src_x = 0; | |
650 |
2/2✓ Branch 0 taken 2218 times.
✓ Branch 1 taken 132 times.
|
2350 | for (; src_x + 4 < src_width; src_x += 4) { |
651 | 2218 | size_t dst_x = src_x * 4 + 2; | |
652 | |||
653 | 2218 | float32x4_t a = vld1q_f32(src_row0 + src_x); | |
654 | 2218 | float32x4_t b = vld1q_f32(src_row0 + src_x + 1); | |
655 | 2218 | float32x4_t c = vld1q_f32(src_row1 + src_x); | |
656 | 2218 | float32x4_t d = vld1q_f32(src_row1 + src_x + 1); | |
657 | |||
658 | 2218 | float32x4x4_t dst_a{ | |
659 | 8872 | lerp2d_vector(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d), | |
660 | 2218 | lerp2d_vector(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d), | |
661 | 2218 | lerp2d_vector(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d), | |
662 | 2218 | lerp2d_vector(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d), | |
663 | }; | ||
664 | 2218 | float32x4x4_t dst_d{ | |
665 | 8872 | lerp2d_vector(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d), | |
666 | 2218 | lerp2d_vector(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d), | |
667 | 2218 | lerp2d_vector(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d), | |
668 | 2218 | lerp2d_vector(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d), | |
669 | }; | ||
670 | 2218 | const float one_3rd = 0.3333333333333333F; | |
671 | 2218 | const float two_3rd = 0.6666666666666667F; | |
672 | 2218 | vst4q_f32(dst_row0 + dst_x, dst_a); | |
673 | 2218 | vst4q_f32(dst_row1 + dst_x, | |
674 | (float32x4x4_t{ | ||
675 | lerp1d_vector(two_3rd, dst_a.val[0], one_3rd, dst_d.val[0]), | ||
676 | lerp1d_vector(two_3rd, dst_a.val[1], one_3rd, dst_d.val[1]), | ||
677 | lerp1d_vector(two_3rd, dst_a.val[2], one_3rd, dst_d.val[2]), | ||
678 | lerp1d_vector(two_3rd, dst_a.val[3], one_3rd, dst_d.val[3]), | ||
679 | })); | ||
680 | 2218 | vst4q_f32(dst_row2 + dst_x, | |
681 | (float32x4x4_t{ | ||
682 | lerp1d_vector(one_3rd, dst_a.val[0], two_3rd, dst_d.val[0]), | ||
683 | lerp1d_vector(one_3rd, dst_a.val[1], two_3rd, dst_d.val[1]), | ||
684 | lerp1d_vector(one_3rd, dst_a.val[2], two_3rd, dst_d.val[2]), | ||
685 | lerp1d_vector(one_3rd, dst_a.val[3], two_3rd, dst_d.val[3]), | ||
686 | })); | ||
687 | 2218 | vst4q_f32(dst_row3 + dst_x, dst_d); | |
688 | 2218 | } | |
689 | |||
690 |
2/2✓ Branch 0 taken 260 times.
✓ Branch 1 taken 132 times.
|
392 | for (; src_x + 1 < src_width; ++src_x) { |
691 | 260 | size_t dst_x = src_x * 4 + 2; | |
692 | 260 | const T a = src_row0[src_x], b = src_row0[src_x + 1], c = src_row1[src_x], | |
693 | 260 | d = src_row1[src_x + 1]; | |
694 | |||
695 | 260 | dst_row0[dst_x + 0] = | |
696 | 260 | lerp2d_scalar(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d); | |
697 | 260 | dst_row0[dst_x + 1] = | |
698 | 260 | lerp2d_scalar(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d); | |
699 | 260 | dst_row0[dst_x + 2] = | |
700 | 260 | lerp2d_scalar(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d); | |
701 | 260 | dst_row0[dst_x + 3] = | |
702 | 260 | lerp2d_scalar(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d); | |
703 | 260 | dst_row1[dst_x + 0] = | |
704 | 260 | lerp2d_scalar(0.546875F, a, 0.078125F, b, 0.328125F, c, 0.046875F, d); | |
705 | 260 | dst_row1[dst_x + 1] = | |
706 | 260 | lerp2d_scalar(0.390625F, a, 0.234375F, b, 0.234375F, c, 0.140625F, d); | |
707 | 260 | dst_row1[dst_x + 2] = | |
708 | 260 | lerp2d_scalar(0.234375F, a, 0.390625F, b, 0.140625F, c, 0.234375F, d); | |
709 | 260 | dst_row1[dst_x + 3] = | |
710 | 260 | lerp2d_scalar(0.078125F, a, 0.546875F, b, 0.046875F, c, 0.328125F, d); | |
711 | 260 | dst_row2[dst_x + 0] = | |
712 | 260 | lerp2d_scalar(0.328125F, a, 0.046875F, b, 0.546875F, c, 0.078125F, d); | |
713 | 260 | dst_row2[dst_x + 1] = | |
714 | 260 | lerp2d_scalar(0.234375F, a, 0.140625F, b, 0.390625F, c, 0.234375F, d); | |
715 | 260 | dst_row2[dst_x + 2] = | |
716 | 260 | lerp2d_scalar(0.140625F, a, 0.234375F, b, 0.234375F, c, 0.390625F, d); | |
717 | 260 | dst_row2[dst_x + 3] = | |
718 | 260 | lerp2d_scalar(0.046875F, a, 0.328125F, b, 0.078125F, c, 0.546875F, d); | |
719 | 260 | dst_row3[dst_x + 0] = | |
720 | 260 | lerp2d_scalar(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d); | |
721 | 260 | dst_row3[dst_x + 1] = | |
722 | 260 | lerp2d_scalar(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d); | |
723 | 260 | dst_row3[dst_x + 2] = | |
724 | 260 | lerp2d_scalar(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d); | |
725 | 260 | dst_row3[dst_x + 3] = | |
726 | 260 | lerp2d_scalar(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d); | |
727 | 260 | } | |
728 | 132 | }; | |
729 | |||
730 | // Top rows | ||
731 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 37 times.
|
38 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
732 | 37 | process_edge_row(src, dst); | |
733 | 37 | memcpy(dst + dst_stride, dst, dst_stride * sizeof(T)); | |
734 | 37 | } | |
735 | |||
736 | // Middle rows | ||
737 |
2/2✓ Branch 0 taken 132 times.
✓ Branch 1 taken 38 times.
|
170 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
738 | 132 | size_t dst_y = src_y * 4 + 2; | |
739 | 132 | const T *src_row0 = src + src_stride * src_y; | |
740 | 132 | const T *src_row1 = src_row0 + src_stride; | |
741 | 132 | T *dst_row0 = dst + dst_stride * dst_y; | |
742 | 132 | T *dst_row1 = dst_row0 + dst_stride; | |
743 | 132 | T *dst_row2 = dst_row1 + dst_stride; | |
744 | 132 | T *dst_row3 = dst_row2 + dst_stride; | |
745 | |||
746 | 132 | process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); | |
747 | 132 | } | |
748 | |||
749 | // Bottom rows | ||
750 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 37 times.
|
38 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
751 | 74 | process_edge_row(src + src_stride * (src_height - 1), | |
752 | 37 | dst + dst_stride * (dst_height - 2)); | |
753 | 111 | memcpy(dst + dst_stride * (dst_height - 1), | |
754 | 74 | dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T)); | |
755 | 37 | } | |
756 | |||
757 | 38 | return KLEIDICV_OK; | |
758 | 38 | } | |
759 | |||
760 | 34 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( | |
761 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
762 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
763 | 34 | size_t dst_width = src_width * 8; | |
764 | 34 | size_t dst_height = src_height * 8; | |
765 | 34 | src_stride /= sizeof(float); | |
766 | 34 | dst_stride /= sizeof(float); | |
767 | |||
768 | 34 | float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0, | |
769 | 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0}; | ||
770 | 34 | float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0, | |
771 | 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0}; | ||
772 | 34 | float32x4_t coeffs_a0 = vld1q_f32(&coeffs_a[0]); | |
773 | 34 | float32x4_t coeffs_a1 = vld1q_f32(&coeffs_a[4]); | |
774 | 34 | float32x4_t coeffs_b0 = vld1q_f32(&coeffs_b[0]); | |
775 | 34 | float32x4_t coeffs_b1 = vld1q_f32(&coeffs_b[4]); | |
776 | |||
777 | 2082 | auto lerp1d_vector_n = [](float p, float32x4_t a, float q, float32x4_t b) { | |
778 | 2048 | return vmlaq_n_f32(vmulq_n_f32(a, p), b, q); | |
779 | }; | ||
780 | |||
781 | 109570 | auto lerp1d_vector_n2 = [](float32x4_t a, float q, float32x4_t b) { | |
782 | 109536 | return vmlaq_n_f32(a, b, q); | |
783 | }; | ||
784 | |||
785 | 8970 | auto lerp1d_vector = [](float32x4_t p, float32x4_t a, float32x4_t q, | |
786 | float32x4_t b) { | ||
787 | 8936 | return vmlaq_f32(vmulq_f32(a, p), b, q); | |
788 | }; | ||
789 | |||
790 | // Handle top or bottom edge | ||
791 | 34 | auto process_edge_row = | |
792 | 100 | [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, | |
793 | &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) { | ||
794 | // Left elements | ||
795 | 66 | dst_row[3] = dst_row[2] = dst_row[1] = dst_row[0] = src_row[0]; | |
796 | 66 | dst_row[dst_stride + 3] = dst_row[dst_stride + 2] = | |
797 | 66 | dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0]; | |
798 | 66 | dst_row[2 * dst_stride + 3] = dst_row[2 * dst_stride + 2] = | |
799 | 66 | dst_row[2 * dst_stride + 1] = dst_row[2 * dst_stride] = src_row[0]; | |
800 | 66 | dst_row[3 * dst_stride + 3] = dst_row[3 * dst_stride + 2] = | |
801 | 66 | dst_row[3 * dst_stride + 1] = dst_row[3 * dst_stride] = src_row[0]; | |
802 | |||
803 | // Right elements | ||
804 | 66 | float *dst_right = dst_row + dst_width - 4; | |
805 | 66 | dst_right[3] = dst_right[2] = dst_right[1] = dst_right[0] = | |
806 | 66 | src_row[src_width - 1]; | |
807 | 66 | dst_right[dst_stride + 3] = dst_right[dst_stride + 2] = | |
808 | 66 | dst_right[dst_stride + 1] = dst_right[dst_stride] = | |
809 | 66 | src_row[src_width - 1]; | |
810 | 66 | dst_right[2 * dst_stride + 3] = dst_right[2 * dst_stride + 2] = | |
811 | 66 | dst_right[2 * dst_stride + 1] = dst_right[2 * dst_stride] = | |
812 | 66 | src_row[src_width - 1]; | |
813 | 66 | dst_right[3 * dst_stride + 3] = dst_right[3 * dst_stride + 2] = | |
814 | 66 | dst_right[3 * dst_stride + 1] = dst_right[3 * dst_stride] = | |
815 | 66 | src_row[src_width - 1]; | |
816 | |||
817 | // Middle elements | ||
818 | 66 | float32x4_t a, b = vdupq_n_f32(src_row[0]); | |
819 |
2/2✓ Branch 0 taken 66 times.
✓ Branch 1 taken 4468 times.
|
4534 | for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { |
820 | 4468 | a = b; | |
821 | 4468 | b = vdupq_n_f32(src_row[src_x + 1]); | |
822 | 4468 | float *dst_row0 = dst_row + src_x * 8 + 4; | |
823 | 4468 | float *dst_row1 = dst_row0 + dst_stride; | |
824 | 4468 | float *dst_row2 = dst_row1 + dst_stride; | |
825 | 4468 | float *dst_row3 = dst_row2 + dst_stride; | |
826 | 4468 | float32x4_t dst = lerp1d_vector(coeffs_a0, a, coeffs_b0, b); | |
827 | 4468 | vst1q(dst_row0, dst); | |
828 | 4468 | vst1q(dst_row1, dst); | |
829 | 4468 | vst1q(dst_row2, dst); | |
830 | 4468 | vst1q(dst_row3, dst); | |
831 | 4468 | dst = lerp1d_vector(coeffs_a1, a, coeffs_b1, b); | |
832 | 4468 | vst1q(dst_row0 + 4, dst); | |
833 | 4468 | vst1q(dst_row1 + 4, dst); | |
834 | 4468 | vst1q(dst_row2 + 4, dst); | |
835 | 4468 | vst1q(dst_row3 + 4, dst); | |
836 | 4468 | } | |
837 | 66 | }; | |
838 | |||
839 | 34 | float32x4_t coeffs_p0 = vmulq_n_f32(coeffs_a0, 15.0 / 16); | |
840 | 34 | float32x4_t coeffs_q0 = vmulq_n_f32(coeffs_b0, 15.0 / 16); | |
841 | 34 | float32x4_t coeffs_r0 = vmulq_n_f32(coeffs_a0, 1.0 / 16); | |
842 | 34 | float32x4_t coeffs_s0 = vmulq_n_f32(coeffs_b0, 1.0 / 16); | |
843 | 34 | float32x4_t coeffs_p1 = vmulq_n_f32(coeffs_a1, 15.0 / 16); | |
844 | 34 | float32x4_t coeffs_q1 = vmulq_n_f32(coeffs_b1, 15.0 / 16); | |
845 | 34 | float32x4_t coeffs_r1 = vmulq_n_f32(coeffs_a1, 1.0 / 16); | |
846 | 34 | float32x4_t coeffs_s1 = vmulq_n_f32(coeffs_b1, 1.0 / 16); | |
847 | |||
848 | 36546 | auto lerp2d_vector = [](float32x4_t a, float32x4_t p, float32x4_t b, | |
849 | float32x4_t q, float32x4_t c, float32x4_t r, | ||
850 | float32x4_t d, float32x4_t s) { | ||
851 | 36512 | return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s); | |
852 | }; | ||
853 | |||
854 | 162 | auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n, | |
855 | lerp1d_vector_n2, &coeffs_p0, &coeffs_q0, &coeffs_r0, | ||
856 | &coeffs_s0, &coeffs_p1, &coeffs_q1, &coeffs_r1, | ||
857 | &coeffs_s1](const float *src_row0, const float *src_row1, | ||
858 | float *dst_row0, size_t dst_stride) { | ||
859 | // Left elements | ||
860 | 128 | float32x4_t s0 = vdupq_n_f32(src_row0[0]); | |
861 | 128 | float32x4_t s1 = vdupq_n_f32(src_row1[0]); | |
862 | 128 | float *dst_row = dst_row0; | |
863 |
2/2✓ Branch 0 taken 1024 times.
✓ Branch 1 taken 128 times.
|
1152 | for (size_t i = 0; i < 8; ++i) { |
864 | 2048 | vst1q(dst_row, | |
865 | 2048 | lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0, | |
866 | 1024 | static_cast<float>(i * 2 + 1) / 16.0F, s1)); | |
867 | 1024 | dst_row += dst_stride; | |
868 | 1024 | } | |
869 | |||
870 | // Middle elements | ||
871 | 128 | dst_row0 += 4; | |
872 | 128 | float *dst_row1 = dst_row0 + dst_stride; | |
873 | 128 | float *dst_row2 = dst_row1 + dst_stride; | |
874 | 128 | float *dst_row3 = dst_row2 + dst_stride; | |
875 | 128 | float *dst_row4 = dst_row3 + dst_stride; | |
876 | 128 | float *dst_row5 = dst_row4 + dst_stride; | |
877 | 128 | float *dst_row6 = dst_row5 + dst_stride; | |
878 | 128 | float *dst_row7 = dst_row6 + dst_stride; | |
879 | 128 | float32x4_t a, b = s0; | |
880 | 128 | float32x4_t c, d = s1; | |
881 |
2/2✓ Branch 0 taken 9128 times.
✓ Branch 1 taken 128 times.
|
9256 | for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { |
882 | 9128 | a = b; | |
883 | 9128 | b = vdupq_n_f32(src_row0[src_x + 1]); | |
884 | 9128 | c = d; | |
885 | 9128 | d = vdupq_n_f32(src_row1[src_x + 1]); | |
886 | 9128 | float32x4x2_t dst_0; | |
887 | 9128 | dst_0.val[0] = | |
888 | 9128 | lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); | |
889 | 9128 | dst_0.val[1] = | |
890 | 9128 | lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d); | |
891 | |||
892 | 9128 | neon::VecTraits<float>::store(dst_0, dst_row0); | |
893 | 9128 | float32x4x2_t dst_7; | |
894 | 9128 | dst_7.val[0] = | |
895 | 9128 | lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); | |
896 | 9128 | dst_7.val[1] = | |
897 | 9128 | lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d); | |
898 | |||
899 | 9128 | neon::VecTraits<float>::store(dst_7, dst_row7); | |
900 | 9128 | float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]); | |
901 | 9128 | float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]); | |
902 | |||
903 | 9128 | float32x4x2_t dst; | |
904 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0); | |
905 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1); | |
906 | |||
907 | 9128 | neon::VecTraits<float>::store(dst, dst_row1); | |
908 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0); | |
909 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1); | |
910 | |||
911 | 9128 | neon::VecTraits<float>::store(dst, dst_row2); | |
912 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0); | |
913 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1); | |
914 | |||
915 | 9128 | neon::VecTraits<float>::store(dst, dst_row3); | |
916 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0); | |
917 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1); | |
918 | |||
919 | 9128 | neon::VecTraits<float>::store(dst, dst_row4); | |
920 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0); | |
921 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1); | |
922 | |||
923 | 9128 | neon::VecTraits<float>::store(dst, dst_row5); | |
924 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0); | |
925 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1); | |
926 | |||
927 | 9128 | neon::VecTraits<float>::store(dst, dst_row6); | |
928 | 9128 | dst_row0 += 8; | |
929 | 9128 | dst_row1 += 8; | |
930 | 9128 | dst_row2 += 8; | |
931 | 9128 | dst_row3 += 8; | |
932 | 9128 | dst_row4 += 8; | |
933 | 9128 | dst_row5 += 8; | |
934 | 9128 | dst_row6 += 8; | |
935 | 9128 | dst_row7 += 8; | |
936 | 9128 | } | |
937 | |||
938 | // Right elements | ||
939 | 128 | s0 = b; | |
940 | 128 | s1 = d; | |
941 | 128 | dst_row = dst_row0; | |
942 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 1024 times.
|
1152 | for (size_t i = 0; i < 8; ++i) { |
943 | 2048 | vst1q(dst_row, | |
944 | 2048 | lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0, | |
945 | 1024 | static_cast<float>(i * 2 + 1) / 16.0F, s1)); | |
946 | 1024 | dst_row += dst_stride; | |
947 | 1024 | } | |
948 | 128 | }; | |
949 | |||
950 | // Top rows | ||
951 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
952 | 33 | process_edge_row(src, dst, dst_stride); | |
953 | 33 | } | |
954 | |||
955 | // Middle rows | ||
956 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
|
162 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
957 | 128 | size_t dst_y = src_y * 8 + 4; | |
958 | 128 | const float *src_row0 = src + src_stride * src_y; | |
959 | 128 | const float *src_row1 = src_row0 + src_stride; | |
960 | 128 | process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride); | |
961 | 128 | } | |
962 | |||
963 | // Bottom rows | ||
964 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
965 | 66 | process_edge_row(src + src_stride * (src_height - 1), | |
966 | 33 | dst + dst_stride * (dst_height - 4), dst_stride); | |
967 | 33 | } | |
968 | |||
969 | 34 | return KLEIDICV_OK; | |
970 | 34 | } | |
971 | |||
972 | 129 | kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride, | |
973 | size_t src_width, size_t src_height, | ||
974 | size_t y_begin, size_t y_end, | ||
975 | float *dst, size_t dst_stride, | ||
976 | size_t dst_width, size_t dst_height) { | ||
977 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 128 times.
|
129 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
978 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 127 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 127 times.
|
128 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
979 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 126 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 125 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 125 times.
|
127 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
980 | |||
981 |
4/4✓ Branch 0 taken 123 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 122 times.
|
125 | if (src_width == 0 || src_height == 0) { |
982 | 3 | return KLEIDICV_OK; | |
983 | } | ||
984 |
3/4✓ Branch 0 taken 50 times.
✓ Branch 1 taken 72 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 50 times.
|
122 | if (src_width * 2 == dst_width && src_height * 2 == dst_height) { |
985 | 100 | return resize_2x2_f32(src, src_stride, src_width, src_height, y_begin, | |
986 | 50 | y_end, dst, dst_stride); | |
987 | } | ||
988 |
3/4✓ Branch 0 taken 38 times.
✓ Branch 1 taken 34 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 38 times.
|
72 | if (src_width * 4 == dst_width && src_height * 4 == dst_height) { |
989 | 76 | return resize_4x4_f32(src, src_stride, src_width, src_height, y_begin, | |
990 | 38 | y_end, dst, dst_stride); | |
991 | } | ||
992 |
2/4✓ Branch 0 taken 34 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 34 times.
|
34 | if (src_width * 8 == dst_width && src_height * 8 == dst_height) { |
993 | 68 | return resize_8x8_f32(src, src_stride, src_width, src_height, y_begin, | |
994 | 34 | y_end, dst, dst_stride); | |
995 | } | ||
996 | // resize_linear_f32_is_implemented checked the kernel size already. | ||
997 | // GCOVR_EXCL_START | ||
998 | assert(!"resize ratio not implemented"); | ||
999 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1000 | // GCOVR_EXCL_STOP | ||
1001 | 129 | } | |
1002 | |||
1003 | } // namespace kleidicv::neon | ||
1004 |