| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cassert> | ||
| 6 | |||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | #include "kleidicv/resize/resize_linear.h" | ||
| 9 | |||
| 10 | namespace kleidicv::neon { | ||
| 11 | |||
| 12 | template <uint8_t P, uint8_t Q, uint8_t Bias, uint8_t Shift> | ||
| 13 | 9568 | uint8x8_t lerp2d_vector_p_q_q_1(uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 14 | uint8x8_t d) { | ||
| 15 | // b + c | ||
| 16 | 9568 | uint16x8_t b_c = vaddl_u8(b, c); | |
| 17 | |||
| 18 | // a * p | ||
| 19 | 9568 | uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); | |
| 20 | |||
| 21 | // a * p + (b + c) * q | ||
| 22 | 9568 | uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); | |
| 23 | |||
| 24 | // d + bias | ||
| 25 | 9568 | uint16x8_t d_bias = vaddl_u8(d, vdup_n_u8(Bias)); | |
| 26 | |||
| 27 | // a * p + (b + c) * q + d + bias | ||
| 28 | 9568 | uint16x8_t ap_bcq_d_bias = vaddq_u16(ap_bcq, d_bias); | |
| 29 | |||
| 30 | // (a * p + (b + c) * q + d + bias) >> shift | ||
| 31 | 9568 | uint8x8_t result = vshrn_n_u16(ap_bcq_d_bias, Shift); | |
| 32 | 19136 | return result; | |
| 33 | 9568 | } | |
| 34 | |||
| 35 | template <uint8_t P, uint8_t Q, uint8_t R, uint8_t Bias, uint8_t Shift> | ||
| 36 | 4784 | uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 37 | uint8x8_t d) { | ||
| 38 | // b + c | ||
| 39 | 4784 | uint16x8_t b_c = vaddl_u8(b, c); | |
| 40 | |||
| 41 | // a * p | ||
| 42 | 4784 | uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); | |
| 43 | |||
| 44 | // d * r | ||
| 45 | 4784 | uint16x8_t dr = vmull_u8(d, vdup_n_u8(R)); | |
| 46 | |||
| 47 | // a * p + (b + c) * q | ||
| 48 | 4784 | uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); | |
| 49 | |||
| 50 | // d * r + bias | ||
| 51 | 4784 | uint16x8_t dr_bias = vaddq_u16(dr, vdupq_n_u16(Bias)); | |
| 52 | |||
| 53 | // a * p + (b + c) * q + d * r + bias | ||
| 54 | 4784 | uint16x8_t ap_bcq_dr_bias = vaddq_u16(ap_bcq, dr_bias); | |
| 55 | |||
| 56 | // (a * p + (b + c) * q + d * r + bias) >> shift | ||
| 57 | 4784 | uint8x8_t result = vshrn_n_u16(ap_bcq_dr_bias, Shift); | |
| 58 | 9568 | return result; | |
| 59 | 4784 | } | |
| 60 | |||
| 61 | 78 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t kleidicv_resize_2x2_stripe_u8( | |
| 62 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 63 | size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { | ||
| 64 | 78 | size_t dst_width = src_width * 2; | |
| 65 | |||
| 66 | 1358 | auto lerp1d_scalar = [](uint8_t near, uint8_t far) { | |
| 67 | 1280 | return (near * 3 + far + 2) >> 2; | |
| 68 | }; | ||
| 69 | |||
| 70 | 1206 | auto lerp1d_vector = [](uint8x8_t near, uint8x8_t far) { | |
| 71 | 1128 | uint8x8_t three = vdup_n_u8(3); | |
| 72 | 1128 | uint8x8_t two = vdup_n_u8(2); | |
| 73 | |||
| 74 | // near * 3 | ||
| 75 | 1128 | uint16x8_t near3 = vmull_u8(near, three); | |
| 76 | |||
| 77 | // far + 2 | ||
| 78 | 1128 | uint16x8_t far_2 = vaddl_u8(far, two); | |
| 79 | |||
| 80 | // near * 3 + far + 2 | ||
| 81 | 1128 | uint16x8_t near3_far_2 = vaddq_u16(near3, far_2); | |
| 82 | |||
| 83 | // (near * 3 + far + 2) / 4 | ||
| 84 | 1128 | uint8x8_t near3_far_2_div4 = vshrn_n_u16(near3_far_2, 2); | |
| 85 | |||
| 86 | 2256 | return near3_far_2_div4; | |
| 87 | 1128 | }; | |
| 88 | |||
| 89 | 2358 | auto lerp2d_scalar = [](uint8_t near, uint8_t mid_a, uint8_t mid_b, | |
| 90 | uint8_t far) { | ||
| 91 | 2280 | return (near * 9 + (mid_a + mid_b) * 3 + far + 8) >> 4; | |
| 92 | }; | ||
| 93 | |||
| 94 | 4862 | auto lerp2d_vector = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) { | |
| 95 | 4784 | return lerp2d_vector_p_q_q_1<9, 3, 8, 4>(a, b, c, d); | |
| 96 | }; | ||
| 97 | |||
| 98 | // Handle top or bottom edge | ||
| 99 | 200 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 100 | const uint8_t *src_row, uint8_t *dst_row) { | ||
| 101 | // Left element | ||
| 102 | 122 | dst_row[0] = src_row[0]; | |
| 103 | |||
| 104 | // Right element | ||
| 105 | 122 | dst_row[dst_width - 1] = src_row[src_width - 1]; | |
| 106 | |||
| 107 | // Middle elements | ||
| 108 | 122 | size_t src_x = 0; | |
| 109 |
2/2✓ Branch 0 taken 564 times.
✓ Branch 1 taken 122 times.
|
686 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 110 | 564 | size_t dst_x = src_x * 2 + 1; | |
| 111 | 564 | uint8x8_t src_left = vld1_u8(src_row + src_x); | |
| 112 | 564 | uint8x8_t src_right = vld1_u8(src_row + src_x + 1); | |
| 113 | |||
| 114 | 564 | uint8x8_t dst_left = lerp1d_vector(src_left, src_right); | |
| 115 | 564 | uint8x8_t dst_right = lerp1d_vector(src_right, src_left); | |
| 116 | |||
| 117 | 564 | vst2_u8(dst_row + dst_x, (uint8x8x2_t{dst_left, dst_right})); | |
| 118 | 564 | } | |
| 119 |
2/2✓ Branch 0 taken 180 times.
✓ Branch 1 taken 122 times.
|
302 | for (; src_x + 1 < src_width; ++src_x) { |
| 120 | 180 | size_t dst_x = src_x * 2 + 1; | |
| 121 | 180 | const uint8_t src_left = src_row[src_x], src_right = src_row[src_x + 1]; | |
| 122 | 180 | dst_row[dst_x] = lerp1d_scalar(src_left, src_right); | |
| 123 | 180 | dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left); | |
| 124 | 180 | } | |
| 125 | 122 | }; | |
| 126 | |||
| 127 | 308 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
| 128 | lerp2d_vector](const uint8_t *src_row0, | ||
| 129 | const uint8_t *src_row1, uint8_t *dst_row0, | ||
| 130 | uint8_t *dst_row1) { | ||
| 131 | // Left element | ||
| 132 | 230 | dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); | |
| 133 | 230 | dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); | |
| 134 | |||
| 135 | // Right element | ||
| 136 | 230 | dst_row0[dst_width - 1] = | |
| 137 | 230 | lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); | |
| 138 | 230 | dst_row1[dst_width - 1] = | |
| 139 | 230 | lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); | |
| 140 | |||
| 141 | // Middle elements | ||
| 142 | 230 | size_t src_x = 0; | |
| 143 |
2/2✓ Branch 0 taken 1196 times.
✓ Branch 1 taken 230 times.
|
1426 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 144 | 1196 | size_t dst_x = src_x * 2 + 1; | |
| 145 | |||
| 146 | 1196 | uint8x8_t src_tl = vld1_u8(src_row0 + src_x); | |
| 147 | 1196 | uint8x8_t src_tr = vld1_u8(src_row0 + src_x + 1); | |
| 148 | 1196 | uint8x8_t src_bl = vld1_u8(src_row1 + src_x); | |
| 149 | 1196 | uint8x8_t src_br = vld1_u8(src_row1 + src_x + 1); | |
| 150 | |||
| 151 | 1196 | uint8x8_t dst_tl = lerp2d_vector(src_tl, src_tr, src_bl, src_br); | |
| 152 | 1196 | uint8x8_t dst_tr = lerp2d_vector(src_tr, src_tl, src_br, src_bl); | |
| 153 | 1196 | uint8x8_t dst_bl = lerp2d_vector(src_bl, src_tl, src_br, src_tr); | |
| 154 | 1196 | uint8x8_t dst_br = lerp2d_vector(src_br, src_tr, src_bl, src_tl); | |
| 155 | |||
| 156 | 1196 | vst2_u8(dst_row0 + dst_x, (uint8x8x2_t{dst_tl, dst_tr})); | |
| 157 | 1196 | vst2_u8(dst_row1 + dst_x, (uint8x8x2_t{dst_bl, dst_br})); | |
| 158 | 1196 | } | |
| 159 |
2/2✓ Branch 0 taken 570 times.
✓ Branch 1 taken 230 times.
|
800 | for (; src_x + 1 < src_width; ++src_x) { |
| 160 | 570 | size_t dst_x = src_x * 2 + 1; | |
| 161 | 1140 | const uint8_t src_tl = src_row0[src_x], src_tr = src_row0[src_x + 1], | |
| 162 | 1140 | src_bl = src_row1[src_x], src_br = src_row1[src_x + 1]; | |
| 163 | 570 | dst_row0[dst_x] = lerp2d_scalar(src_tl, src_tr, src_bl, src_br); | |
| 164 | 570 | dst_row0[dst_x + 1] = lerp2d_scalar(src_tr, src_tl, src_br, src_bl); | |
| 165 | 570 | dst_row1[dst_x] = lerp2d_scalar(src_bl, src_tl, src_br, src_tr); | |
| 166 | 570 | dst_row1[dst_x + 1] = lerp2d_scalar(src_br, src_tr, src_bl, src_tl); | |
| 167 | 570 | } | |
| 168 | 230 | }; | |
| 169 | |||
| 170 | // Top row | ||
| 171 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 61 times.
|
78 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 172 | 61 | process_edge_row(src, dst); | |
| 173 | 61 | } | |
| 174 | |||
| 175 | // Middle rows | ||
| 176 |
2/2✓ Branch 0 taken 230 times.
✓ Branch 1 taken 78 times.
|
308 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 177 | 230 | size_t dst_y = src_y * 2 + 1; | |
| 178 | 230 | const uint8_t *src_row0 = src + src_stride * src_y; | |
| 179 | 230 | const uint8_t *src_row1 = src_row0 + src_stride; | |
| 180 | 230 | uint8_t *dst_row0 = dst + dst_stride * dst_y; | |
| 181 | 230 | uint8_t *dst_row1 = dst_row0 + dst_stride; | |
| 182 | |||
| 183 | 230 | process_row(src_row0, src_row1, dst_row0, dst_row1); | |
| 184 | 230 | } | |
| 185 | |||
| 186 | // Bottom row | ||
| 187 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 61 times.
|
78 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 188 | 122 | process_edge_row(src + src_stride * (src_height - 1), | |
| 189 | 61 | dst + dst_stride * (2 * src_height - 1)); | |
| 190 | 61 | } | |
| 191 | |||
| 192 | 78 | return KLEIDICV_OK; | |
| 193 | 78 | } | |
| 194 | |||
| 195 | 58 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t kleidicv_resize_4x4_stripe_u8( | |
| 196 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 197 | size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { | ||
| 198 | 58 | size_t dst_width = src_width * 4, dst_height = src_height * 4; | |
| 199 | |||
| 200 | 2234 | auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, | |
| 201 | uint8_t b) { | ||
| 202 | 2176 | return (coeff_a * a + coeff_b * b + 4) >> 3; | |
| 203 | }; | ||
| 204 | 2314 | auto lerp1d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, | |
| 205 | uint8_t coeff_b_scalar, uint8x8_t b) { | ||
| 206 | 2256 | uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); | |
| 207 | 2256 | uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); | |
| 208 | 2256 | uint16x8_t four = vdupq_n_u16(4); | |
| 209 | |||
| 210 | // a * coeff_a | ||
| 211 | 2256 | uint16x8_t a1 = vmull_u8(a, coeff_a); | |
| 212 | |||
| 213 | // b * coeff_b | ||
| 214 | 2256 | uint16x8_t b1 = vmull_u8(b, coeff_b); | |
| 215 | |||
| 216 | // a * coeff_a + b * coeff_b | ||
| 217 | 2256 | uint16x8_t a1_b1 = vaddq_u16(a1, b1); | |
| 218 | |||
| 219 | // a * coeff_a + b * coeff_b + 4 | ||
| 220 | 2256 | uint16x8_t a1_b1_4 = vaddq_u16(a1_b1, four); | |
| 221 | |||
| 222 | // (a * coeff_a + b * coeff_b + 4) / 8 | ||
| 223 | 2256 | uint8x8_t result = vshrn_n_u16(a1_b1_4, 3); | |
| 224 | |||
| 225 | 4512 | return result; | |
| 226 | 2256 | }; | |
| 227 | 8346 | auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, | |
| 228 | uint8_t b, uint8_t coeff_c, uint8_t c, | ||
| 229 | uint8_t coeff_d, uint8_t d) { | ||
| 230 | 8288 | return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6; | |
| 231 | }; | ||
| 232 | 9626 | auto lerp2d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, | |
| 233 | uint8_t coeff_b_scalar, uint8x8_t b, | ||
| 234 | uint8_t coeff_c_scalar, uint8x8_t c, | ||
| 235 | uint8_t coeff_d_scalar, uint8x8_t d) { | ||
| 236 | 9568 | uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); | |
| 237 | 9568 | uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); | |
| 238 | 9568 | uint8x8_t coeff_c = vdup_n_u8(coeff_c_scalar); | |
| 239 | 9568 | uint8x8_t coeff_d = vdup_n_u8(coeff_d_scalar); | |
| 240 | 9568 | uint16x8_t thirtytwo = vdupq_n_u16(32); | |
| 241 | |||
| 242 | // a * coeff_a | ||
| 243 | 9568 | uint16x8_t a1 = vmull_u8(a, coeff_a); | |
| 244 | |||
| 245 | // b * coeff_b | ||
| 246 | 9568 | uint16x8_t b1 = vmull_u8(b, coeff_b); | |
| 247 | |||
| 248 | // c * coeff_c | ||
| 249 | 9568 | uint16x8_t c1 = vmull_u8(c, coeff_c); | |
| 250 | |||
| 251 | // d * coeff_d | ||
| 252 | 9568 | uint16x8_t d1 = vmull_u8(d, coeff_d); | |
| 253 | |||
| 254 | // a * coeff_a + b * coeff_b | ||
| 255 | 9568 | uint16x8_t a1_b1 = vaddq_u16(a1, b1); | |
| 256 | |||
| 257 | // c * coeff_c + d * coeff_d | ||
| 258 | 9568 | uint16x8_t c1_d1 = vaddq_u16(c1, d1); | |
| 259 | |||
| 260 | // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d | ||
| 261 | 9568 | uint16x8_t a1_b1_c1_d1 = vaddq_u16(a1_b1, c1_d1); | |
| 262 | |||
| 263 | // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32 | ||
| 264 | 9568 | uint16x8_t a1_b1_c1_d1_32 = vaddq_u16(a1_b1_c1_d1, thirtytwo); | |
| 265 | |||
| 266 | // (a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32) / 64 | ||
| 267 | 9568 | uint8x8_t result = vshrn_n_u16(a1_b1_c1_d1_32, 6); | |
| 268 | 19136 | return result; | |
| 269 | 9568 | }; | |
| 270 | // Handle top or bottom edge | ||
| 271 | 140 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 272 | const uint8_t *src_row, uint8_t *dst_row) { | ||
| 273 | // Left elements | ||
| 274 | 82 | dst_row[1] = dst_row[0] = src_row[0]; | |
| 275 | |||
| 276 | // Right elements | ||
| 277 | 82 | dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; | |
| 278 | |||
| 279 | // Middle elements | ||
| 280 | 82 | size_t src_x = 0; | |
| 281 |
2/2✓ Branch 0 taken 564 times.
✓ Branch 1 taken 82 times.
|
646 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 282 | 564 | size_t dst_x = src_x * 4 + 2; | |
| 283 | 564 | uint8x8_t a = vld1_u8(src_row + src_x); | |
| 284 | 564 | uint8x8_t b = vld1_u8(src_row + src_x + 1); | |
| 285 | 564 | uint8x8x4_t interpolated = { | |
| 286 | 1692 | lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b), | |
| 287 | 1128 | lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)}; | |
| 288 | |||
| 289 | 564 | vst4_u8(dst_row + dst_x, interpolated); | |
| 290 | 564 | } | |
| 291 |
2/2✓ Branch 0 taken 124 times.
✓ Branch 1 taken 82 times.
|
206 | for (; src_x + 1 < src_width; ++src_x) { |
| 292 | 124 | size_t dst_x = src_x * 4 + 2; | |
| 293 | 124 | const uint8_t a = src_row[src_x], b = src_row[src_x + 1]; | |
| 294 | 124 | dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b); | |
| 295 | 124 | dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b); | |
| 296 | 124 | dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b); | |
| 297 | 124 | dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b); | |
| 298 | 124 | } | |
| 299 | 82 | }; | |
| 300 | |||
| 301 | 268 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
| 302 | lerp2d_vector](const uint8_t *src_row0, | ||
| 303 | const uint8_t *src_row1, uint8_t *dst_row0, | ||
| 304 | uint8_t *dst_row1, uint8_t *dst_row2, | ||
| 305 | uint8_t *dst_row3) { | ||
| 306 | 4994 | auto lerp2d_vector_49_7_7_1 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 307 | uint8x8_t d) { | ||
| 308 | 4784 | return lerp2d_vector_p_q_q_1<49, 7, 32, 6>(a, b, c, d); | |
| 309 | }; | ||
| 310 | 4994 | auto lerp2d_vector_25_15_15_9 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 311 | uint8x8_t d) { | ||
| 312 | 4784 | return lerp2d_vector_p_q_q_r<25, 15, 9, 32, 6>(a, b, c, d); | |
| 313 | }; | ||
| 314 | |||
| 315 | // Left elements | ||
| 316 | 210 | const uint8_t s0l = src_row0[0], s1l = src_row1[0]; | |
| 317 | 210 | dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); | |
| 318 | 210 | dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); | |
| 319 | 210 | dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); | |
| 320 | 210 | dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); | |
| 321 | |||
| 322 | // Right elements | ||
| 323 | 210 | const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; | |
| 324 | 210 | const size_t dr0 = dst_width - 2; | |
| 325 | 210 | const size_t dr1 = dst_width - 1; | |
| 326 | 210 | dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); | |
| 327 | 210 | dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); | |
| 328 | 210 | dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); | |
| 329 | 210 | dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); | |
| 330 | |||
| 331 | // Middle elements | ||
| 332 | 210 | size_t src_x = 0; | |
| 333 |
2/2✓ Branch 0 taken 1196 times.
✓ Branch 1 taken 210 times.
|
1406 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 334 | 1196 | size_t dst_x = src_x * 4 + 2; | |
| 335 | |||
| 336 | 1196 | uint8x8_t a = vld1_u8(src_row0 + src_x); | |
| 337 | 1196 | uint8x8_t b = vld1_u8(src_row0 + src_x + 1); | |
| 338 | 1196 | uint8x8_t c = vld1_u8(src_row1 + src_x); | |
| 339 | 1196 | uint8x8_t d = vld1_u8(src_row1 + src_x + 1); | |
| 340 | |||
| 341 | 2392 | vst4_u8(dst_row0 + dst_x, (uint8x8x4_t{ | |
| 342 | 4784 | lerp2d_vector_49_7_7_1(a, b, c, d), | |
| 343 | 1196 | lerp2d_vector(35, a, 21, b, 5, c, 3, d), | |
| 344 | 1196 | lerp2d_vector(21, a, 35, b, 3, c, 5, d), | |
| 345 | 1196 | lerp2d_vector_49_7_7_1(b, a, d, c), | |
| 346 | })); | ||
| 347 | 2392 | vst4_u8(dst_row1 + dst_x, (uint8x8x4_t{ | |
| 348 | 4784 | lerp2d_vector(35, a, 5, b, 21, c, 3, d), | |
| 349 | 1196 | lerp2d_vector_25_15_15_9(a, b, c, d), | |
| 350 | 1196 | lerp2d_vector_25_15_15_9(b, a, d, c), | |
| 351 | 1196 | lerp2d_vector(5, a, 35, b, 3, c, 21, d), | |
| 352 | })); | ||
| 353 | 2392 | vst4_u8(dst_row2 + dst_x, (uint8x8x4_t{ | |
| 354 | 4784 | lerp2d_vector(21, a, 3, b, 35, c, 5, d), | |
| 355 | 1196 | lerp2d_vector_25_15_15_9(c, a, d, b), | |
| 356 | 1196 | lerp2d_vector_25_15_15_9(d, b, c, a), | |
| 357 | 1196 | lerp2d_vector(3, a, 21, b, 5, c, 35, d), | |
| 358 | })); | ||
| 359 | 2392 | vst4_u8(dst_row3 + dst_x, (uint8x8x4_t{ | |
| 360 | 4784 | lerp2d_vector_49_7_7_1(c, a, d, b), | |
| 361 | 1196 | lerp2d_vector(5, a, 3, b, 35, c, 21, d), | |
| 362 | 1196 | lerp2d_vector(3, a, 5, b, 21, c, 35, d), | |
| 363 | 1196 | lerp2d_vector_49_7_7_1(d, b, c, a), | |
| 364 | })); | ||
| 365 | 1196 | } | |
| 366 |
2/2✓ Branch 0 taken 518 times.
✓ Branch 1 taken 210 times.
|
728 | for (; src_x + 1 < src_width; ++src_x) { |
| 367 | 518 | size_t dst_x = src_x * 4 + 2; | |
| 368 | 1036 | const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1], | |
| 369 | 1036 | c = src_row1[src_x], d = src_row1[src_x + 1]; | |
| 370 | |||
| 371 | 518 | dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d); | |
| 372 | 518 | dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d); | |
| 373 | 518 | dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d); | |
| 374 | 518 | dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d); | |
| 375 | 518 | dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d); | |
| 376 | 518 | dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d); | |
| 377 | 518 | dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d); | |
| 378 | 518 | dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d); | |
| 379 | 518 | dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d); | |
| 380 | 518 | dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d); | |
| 381 | 518 | dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d); | |
| 382 | 518 | dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d); | |
| 383 | 518 | dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d); | |
| 384 | 518 | dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d); | |
| 385 | 518 | dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d); | |
| 386 | 518 | dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d); | |
| 387 | 518 | } | |
| 388 | 210 | }; | |
| 389 | |||
| 390 | // Top rows | ||
| 391 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
|
58 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 392 | 41 | process_edge_row(src, dst); | |
| 393 | 41 | memcpy(dst + dst_stride, dst, dst_stride); | |
| 394 | 41 | } | |
| 395 | |||
| 396 | // Middle rows | ||
| 397 |
2/2✓ Branch 0 taken 210 times.
✓ Branch 1 taken 58 times.
|
268 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 398 | 210 | size_t dst_y = src_y * 4 + 2; | |
| 399 | 210 | const uint8_t *src_row0 = src + src_stride * src_y; | |
| 400 | 210 | const uint8_t *src_row1 = src_row0 + src_stride; | |
| 401 | 210 | uint8_t *dst_row0 = dst + dst_stride * dst_y; | |
| 402 | 210 | uint8_t *dst_row1 = dst_row0 + dst_stride; | |
| 403 | 210 | uint8_t *dst_row2 = dst_row1 + dst_stride; | |
| 404 | 210 | uint8_t *dst_row3 = dst_row2 + dst_stride; | |
| 405 | |||
| 406 | 210 | process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); | |
| 407 | 210 | } | |
| 408 | |||
| 409 | // Bottom rows | ||
| 410 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
|
58 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 411 | 82 | process_edge_row(src + src_stride * (src_height - 1), | |
| 412 | 41 | dst + dst_stride * (dst_height - 2)); | |
| 413 | 123 | memcpy(dst + dst_stride * (dst_height - 1), | |
| 414 | 82 | dst + dst_stride * (dst_height - 2), dst_stride); | |
| 415 | 41 | } | |
| 416 | |||
| 417 | 58 | return KLEIDICV_OK; | |
| 418 | 58 | } | |
| 419 | |||
| 420 | 74 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32( | |
| 421 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 422 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
| 423 | 74 | size_t dst_width = src_width * 2; | |
| 424 | 74 | src_stride /= sizeof(float); | |
| 425 | 74 | dst_stride /= sizeof(float); | |
| 426 | |||
| 427 | 1274 | auto lerp1d_scalar = [](float near, float far) { | |
| 428 | 1200 | return near * 0.75F + far * 0.25F; | |
| 429 | }; | ||
| 430 | |||
| 431 | 2346 | auto lerp1d_vector = [](float32x4_t near, float32x4_t far) { | |
| 432 | 2272 | return vmlaq_n_f32(vmulq_n_f32(near, 0.75F), far, 0.25F); | |
| 433 | }; | ||
| 434 | |||
| 435 | 2162 | auto lerp2d_scalar = [](float near, float mid_a, float mid_b, float far) { | |
| 436 | 2088 | return near * 0.5625F + mid_a * 0.1875F + mid_b * 0.1875F + far * 0.0625F; | |
| 437 | }; | ||
| 438 | |||
| 439 | 9690 | auto lerp2d_vector = [](float32x4_t a, float32x4_t b, float32x4_t c, | |
| 440 | float32x4_t d) { | ||
| 441 | 9616 | return vmlaq_n_f32( | |
| 442 | 9616 | vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, 0.5625F), b, 0.1875F), c, | |
| 443 | 0.1875F), | ||
| 444 | 9616 | d, 0.0625F); | |
| 445 | }; | ||
| 446 | |||
| 447 | // Handle top or bottom edge | ||
| 448 | 188 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 449 | const float *src_row, float *dst_row) { | ||
| 450 | // Left element | ||
| 451 | 114 | dst_row[0] = src_row[0]; | |
| 452 | |||
| 453 | // Right element | ||
| 454 | 114 | dst_row[dst_width - 1] = src_row[src_width - 1]; | |
| 455 | |||
| 456 | // Middle elements | ||
| 457 | 114 | size_t src_x = 0; | |
| 458 |
2/2✓ Branch 0 taken 1136 times.
✓ Branch 1 taken 114 times.
|
1250 | for (; src_x + 4 < src_width; src_x += 4) { |
| 459 | 1136 | size_t dst_x = src_x * 2 + 1; | |
| 460 | 1136 | float32x4_t src_left = vld1q_f32(src_row + src_x); | |
| 461 | 1136 | float32x4_t src_right = vld1q_f32(src_row + src_x + 1); | |
| 462 | |||
| 463 | 1136 | float32x4_t dst_left = lerp1d_vector(src_left, src_right); | |
| 464 | 1136 | float32x4_t dst_right = lerp1d_vector(src_right, src_left); | |
| 465 | |||
| 466 | 1136 | vst2q_f32(dst_row + dst_x, (float32x4x2_t{dst_left, dst_right})); | |
| 467 | 1136 | } | |
| 468 |
2/2✓ Branch 0 taken 140 times.
✓ Branch 1 taken 114 times.
|
254 | for (; src_x + 1 < src_width; ++src_x) { |
| 469 | 140 | size_t dst_x = src_x * 2 + 1; | |
| 470 | 140 | const float src_left = src_row[src_x], src_right = src_row[src_x + 1]; | |
| 471 | 140 | dst_row[dst_x] = lerp1d_scalar(src_left, src_right); | |
| 472 | 140 | dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left); | |
| 473 | 140 | } | |
| 474 | 114 | }; | |
| 475 | |||
| 476 | 304 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
| 477 | lerp2d_vector](const float *src_row0, | ||
| 478 | const float *src_row1, float *dst_row0, | ||
| 479 | float *dst_row1) { | ||
| 480 | // Left element | ||
| 481 | 230 | dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); | |
| 482 | 230 | dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); | |
| 483 | |||
| 484 | // Right element | ||
| 485 | 230 | dst_row0[dst_width - 1] = | |
| 486 | 230 | lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); | |
| 487 | 230 | dst_row1[dst_width - 1] = | |
| 488 | 230 | lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); | |
| 489 | |||
| 490 | // Middle elements | ||
| 491 | 230 | size_t src_x = 0; | |
| 492 |
2/2✓ Branch 0 taken 2404 times.
✓ Branch 1 taken 230 times.
|
2634 | for (; src_x + 4 < src_width; src_x += 4) { |
| 493 | 2404 | size_t dst_x = src_x * 2 + 1; | |
| 494 | |||
| 495 | 2404 | float32x4_t a = vld1q_f32(src_row0 + src_x); | |
| 496 | 2404 | float32x4_t b = vld1q_f32(src_row0 + src_x + 1); | |
| 497 | 2404 | float32x4_t c = vld1q_f32(src_row1 + src_x); | |
| 498 | 2404 | float32x4_t d = vld1q_f32(src_row1 + src_x + 1); | |
| 499 | |||
| 500 | 4808 | vst2q_f32(dst_row0 + dst_x, (float32x4x2_t{lerp2d_vector(a, b, c, d), | |
| 501 | 2404 | lerp2d_vector(b, a, d, c)})); | |
| 502 | 4808 | vst2q_f32(dst_row1 + dst_x, (float32x4x2_t{lerp2d_vector(c, a, d, b), | |
| 503 | 2404 | lerp2d_vector(d, b, c, a)})); | |
| 504 | 2404 | } | |
| 505 |
2/2✓ Branch 0 taken 522 times.
✓ Branch 1 taken 230 times.
|
752 | for (; src_x + 1 < src_width; ++src_x) { |
| 506 | 522 | size_t dst_x = src_x * 2 + 1; | |
| 507 | 1044 | const float a = src_row0[src_x], b = src_row0[src_x + 1], | |
| 508 | 1044 | c = src_row1[src_x], d = src_row1[src_x + 1]; | |
| 509 | 522 | dst_row0[dst_x] = lerp2d_scalar(a, b, c, d); | |
| 510 | 522 | dst_row0[dst_x + 1] = lerp2d_scalar(b, a, d, c); | |
| 511 | 522 | dst_row1[dst_x] = lerp2d_scalar(c, a, d, b); | |
| 512 | 522 | dst_row1[dst_x + 1] = lerp2d_scalar(d, b, c, a); | |
| 513 | 522 | } | |
| 514 | 230 | }; | |
| 515 | |||
| 516 | // Top row | ||
| 517 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 57 times.
|
74 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 518 | 57 | process_edge_row(src, dst); | |
| 519 | 57 | } | |
| 520 | |||
| 521 | // Middle rows | ||
| 522 |
2/2✓ Branch 0 taken 230 times.
✓ Branch 1 taken 74 times.
|
304 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 523 | 230 | size_t dst_y = src_y * 2 + 1; | |
| 524 | 230 | const float *src_row0 = src + src_stride * src_y; | |
| 525 | 230 | const float *src_row1 = src_row0 + src_stride; | |
| 526 | 230 | float *dst_row0 = dst + dst_stride * dst_y; | |
| 527 | 230 | float *dst_row1 = dst_row0 + dst_stride; | |
| 528 | |||
| 529 | 230 | process_row(src_row0, src_row1, dst_row0, dst_row1); | |
| 530 | 230 | } | |
| 531 | |||
| 532 | // Bottom row | ||
| 533 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 57 times.
|
74 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 534 | 114 | process_edge_row(src + src_stride * (src_height - 1), | |
| 535 | 57 | dst + dst_stride * (src_height * 2 - 1)); | |
| 536 | 57 | } | |
| 537 | |||
| 538 | 74 | return KLEIDICV_OK; | |
| 539 | 74 | } | |
| 540 | |||
| 541 | 62 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( | |
| 542 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 543 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
| 544 | using T = float; | ||
| 545 | 62 | size_t dst_height = src_height * 4; | |
| 546 | 62 | size_t dst_width = src_width * 4; | |
| 547 | 62 | src_stride /= sizeof(T); | |
| 548 | 62 | dst_stride /= sizeof(T); | |
| 549 | |||
| 550 | 2174 | auto lerp1d_scalar = [](T coeff_a, T a, T coeff_b, T b) { | |
| 551 | 2112 | return coeff_a * a + coeff_b * b; | |
| 552 | }; | ||
| 553 | 23838 | auto lerp1d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b) { | |
| 554 | 23776 | return vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b); | |
| 555 | }; | ||
| 556 | 7646 | auto lerp2d_scalar = [](T coeff_a, T a, T coeff_b, T b, T coeff_c, T c, | |
| 557 | T coeff_d, T d) { | ||
| 558 | 7584 | return coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d; | |
| 559 | }; | ||
| 560 | 19294 | auto lerp2d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b, | |
| 561 | T coeff_c, float32x4_t c, T coeff_d, float32x4_t d) { | ||
| 562 | 19232 | return vmlaq_n_f32( | |
| 563 | 38464 | vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b), c, | |
| 564 | 19232 | coeff_c), | |
| 565 | 19232 | d, coeff_d); | |
| 566 | }; | ||
| 567 | // Handle top or bottom edge | ||
| 568 | 152 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 569 | const T *src_row, T *dst_row) { | ||
| 570 | // Left elements | ||
| 571 | 90 | dst_row[1] = dst_row[0] = src_row[0]; | |
| 572 | |||
| 573 | // Right elements | ||
| 574 | 90 | dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; | |
| 575 | |||
| 576 | // Middle elements | ||
| 577 | 90 | size_t src_x = 0; | |
| 578 |
2/2✓ Branch 0 taken 1136 times.
✓ Branch 1 taken 90 times.
|
1226 | for (; src_x + 4 < src_width; src_x += 4) { |
| 579 | 1136 | size_t dst_x = src_x * 4 + 2; | |
| 580 | 1136 | float32x4_t a = vld1q_f32(src_row + src_x); | |
| 581 | 1136 | float32x4_t b = vld1q_f32(src_row + src_x + 1); | |
| 582 | 2272 | vst4q_f32(dst_row + dst_x, | |
| 583 | 4544 | (float32x4x4_t{lerp1d_vector(0.875F, a, 0.125F, b), | |
| 584 | 1136 | lerp1d_vector(0.625F, a, 0.375F, b), | |
| 585 | 1136 | lerp1d_vector(0.375F, a, 0.625F, b), | |
| 586 | 1136 | lerp1d_vector(0.125F, a, 0.875F, b)})); | |
| 587 | 1136 | } | |
| 588 |
2/2✓ Branch 0 taken 100 times.
✓ Branch 1 taken 90 times.
|
190 | for (; src_x + 1 < src_width; ++src_x) { |
| 589 | 100 | size_t dst_x = src_x * 4 + 2; | |
| 590 | 100 | const T a = src_row[src_x], b = src_row[src_x + 1]; | |
| 591 | 100 | dst_row[dst_x + 0] = lerp1d_scalar(0.875F, a, 0.125F, b); | |
| 592 | 100 | dst_row[dst_x + 1] = lerp1d_scalar(0.625F, a, 0.375F, b); | |
| 593 | 100 | dst_row[dst_x + 2] = lerp1d_scalar(0.375F, a, 0.625F, b); | |
| 594 | 100 | dst_row[dst_x + 3] = lerp1d_scalar(0.125F, a, 0.875F, b); | |
| 595 | 100 | } | |
| 596 | 90 | }; | |
| 597 | |||
| 598 | 276 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector, | |
| 599 | lerp2d_scalar, lerp2d_vector]( | ||
| 600 | const T *src_row0, const T *src_row1, T *dst_row0, | ||
| 601 | T *dst_row1, T *dst_row2, T *dst_row3) { | ||
| 602 | // Left elements | ||
| 603 | 214 | const T s0l = src_row0[0], s1l = src_row1[0]; | |
| 604 | 214 | dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l); | |
| 605 | 214 | dst_row1[0] = dst_row1[1] = lerp1d_scalar(0.625F, s0l, 0.375F, s1l); | |
| 606 | 214 | dst_row2[0] = dst_row2[1] = lerp1d_scalar(0.375F, s0l, 0.625F, s1l); | |
| 607 | 214 | dst_row3[0] = dst_row3[1] = lerp1d_scalar(0.125F, s0l, 0.875F, s1l); | |
| 608 | |||
| 609 | // Right elements | ||
| 610 | 214 | const T s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; | |
| 611 | 214 | const size_t dr0 = dst_width - 2; | |
| 612 | 214 | const size_t dr1 = dst_width - 1; | |
| 613 | 214 | dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(0.875F, s0r, 0.125F, s1r); | |
| 614 | 214 | dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(0.625F, s0r, 0.375F, s1r); | |
| 615 | 214 | dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(0.375F, s0r, 0.625F, s1r); | |
| 616 | 214 | dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r); | |
| 617 | |||
| 618 | // Middle elements | ||
| 619 | 214 | size_t src_x = 0; | |
| 620 |
2/2✓ Branch 0 taken 2404 times.
✓ Branch 1 taken 214 times.
|
2618 | for (; src_x + 4 < src_width; src_x += 4) { |
| 621 | 2404 | size_t dst_x = src_x * 4 + 2; | |
| 622 | |||
| 623 | 2404 | float32x4_t a = vld1q_f32(src_row0 + src_x); | |
| 624 | 2404 | float32x4_t b = vld1q_f32(src_row0 + src_x + 1); | |
| 625 | 2404 | float32x4_t c = vld1q_f32(src_row1 + src_x); | |
| 626 | 2404 | float32x4_t d = vld1q_f32(src_row1 + src_x + 1); | |
| 627 | |||
| 628 | 2404 | float32x4x4_t dst_a{ | |
| 629 | 9616 | lerp2d_vector(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d), | |
| 630 | 2404 | lerp2d_vector(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d), | |
| 631 | 2404 | lerp2d_vector(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d), | |
| 632 | 2404 | lerp2d_vector(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d), | |
| 633 | }; | ||
| 634 | 2404 | float32x4x4_t dst_d{ | |
| 635 | 9616 | lerp2d_vector(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d), | |
| 636 | 2404 | lerp2d_vector(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d), | |
| 637 | 2404 | lerp2d_vector(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d), | |
| 638 | 2404 | lerp2d_vector(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d), | |
| 639 | }; | ||
| 640 | 2404 | const float one_3rd = 0.3333333333333333F; | |
| 641 | 2404 | const float two_3rd = 0.6666666666666667F; | |
| 642 | 2404 | vst4q_f32(dst_row0 + dst_x, dst_a); | |
| 643 | 4808 | vst4q_f32(dst_row1 + dst_x, | |
| 644 | 2404 | (float32x4x4_t{ | |
| 645 | 9616 | lerp1d_vector(two_3rd, dst_a.val[0], one_3rd, dst_d.val[0]), | |
| 646 | 2404 | lerp1d_vector(two_3rd, dst_a.val[1], one_3rd, dst_d.val[1]), | |
| 647 | 2404 | lerp1d_vector(two_3rd, dst_a.val[2], one_3rd, dst_d.val[2]), | |
| 648 | 2404 | lerp1d_vector(two_3rd, dst_a.val[3], one_3rd, dst_d.val[3]), | |
| 649 | })); | ||
| 650 | 4808 | vst4q_f32(dst_row2 + dst_x, | |
| 651 | 2404 | (float32x4x4_t{ | |
| 652 | 9616 | lerp1d_vector(one_3rd, dst_a.val[0], two_3rd, dst_d.val[0]), | |
| 653 | 2404 | lerp1d_vector(one_3rd, dst_a.val[1], two_3rd, dst_d.val[1]), | |
| 654 | 2404 | lerp1d_vector(one_3rd, dst_a.val[2], two_3rd, dst_d.val[2]), | |
| 655 | 2404 | lerp1d_vector(one_3rd, dst_a.val[3], two_3rd, dst_d.val[3]), | |
| 656 | })); | ||
| 657 | 2404 | vst4q_f32(dst_row3 + dst_x, dst_d); | |
| 658 | 2404 | } | |
| 659 | |||
| 660 |
2/2✓ Branch 0 taken 474 times.
✓ Branch 1 taken 214 times.
|
688 | for (; src_x + 1 < src_width; ++src_x) { |
| 661 | 474 | size_t dst_x = src_x * 4 + 2; | |
| 662 | 474 | const T a = src_row0[src_x], b = src_row0[src_x + 1], c = src_row1[src_x], | |
| 663 | 474 | d = src_row1[src_x + 1]; | |
| 664 | |||
| 665 | 474 | dst_row0[dst_x + 0] = | |
| 666 | 474 | lerp2d_scalar(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d); | |
| 667 | 474 | dst_row0[dst_x + 1] = | |
| 668 | 474 | lerp2d_scalar(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d); | |
| 669 | 474 | dst_row0[dst_x + 2] = | |
| 670 | 474 | lerp2d_scalar(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d); | |
| 671 | 474 | dst_row0[dst_x + 3] = | |
| 672 | 474 | lerp2d_scalar(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d); | |
| 673 | 474 | dst_row1[dst_x + 0] = | |
| 674 | 474 | lerp2d_scalar(0.546875F, a, 0.078125F, b, 0.328125F, c, 0.046875F, d); | |
| 675 | 474 | dst_row1[dst_x + 1] = | |
| 676 | 474 | lerp2d_scalar(0.390625F, a, 0.234375F, b, 0.234375F, c, 0.140625F, d); | |
| 677 | 474 | dst_row1[dst_x + 2] = | |
| 678 | 474 | lerp2d_scalar(0.234375F, a, 0.390625F, b, 0.140625F, c, 0.234375F, d); | |
| 679 | 474 | dst_row1[dst_x + 3] = | |
| 680 | 474 | lerp2d_scalar(0.078125F, a, 0.546875F, b, 0.046875F, c, 0.328125F, d); | |
| 681 | 474 | dst_row2[dst_x + 0] = | |
| 682 | 474 | lerp2d_scalar(0.328125F, a, 0.046875F, b, 0.546875F, c, 0.078125F, d); | |
| 683 | 474 | dst_row2[dst_x + 1] = | |
| 684 | 474 | lerp2d_scalar(0.234375F, a, 0.140625F, b, 0.390625F, c, 0.234375F, d); | |
| 685 | 474 | dst_row2[dst_x + 2] = | |
| 686 | 474 | lerp2d_scalar(0.140625F, a, 0.234375F, b, 0.234375F, c, 0.390625F, d); | |
| 687 | 474 | dst_row2[dst_x + 3] = | |
| 688 | 474 | lerp2d_scalar(0.046875F, a, 0.328125F, b, 0.078125F, c, 0.546875F, d); | |
| 689 | 474 | dst_row3[dst_x + 0] = | |
| 690 | 474 | lerp2d_scalar(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d); | |
| 691 | 474 | dst_row3[dst_x + 1] = | |
| 692 | 474 | lerp2d_scalar(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d); | |
| 693 | 474 | dst_row3[dst_x + 2] = | |
| 694 | 474 | lerp2d_scalar(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d); | |
| 695 | 474 | dst_row3[dst_x + 3] = | |
| 696 | 474 | lerp2d_scalar(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d); | |
| 697 | 474 | } | |
| 698 | 214 | }; | |
| 699 | |||
| 700 | // Top rows | ||
| 701 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 45 times.
|
62 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 702 | 45 | process_edge_row(src, dst); | |
| 703 | 45 | memcpy(dst + dst_stride, dst, dst_stride * sizeof(T)); | |
| 704 | 45 | } | |
| 705 | |||
| 706 | // Middle rows | ||
| 707 |
2/2✓ Branch 0 taken 214 times.
✓ Branch 1 taken 62 times.
|
276 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 708 | 214 | size_t dst_y = src_y * 4 + 2; | |
| 709 | 214 | const T *src_row0 = src + src_stride * src_y; | |
| 710 | 214 | const T *src_row1 = src_row0 + src_stride; | |
| 711 | 214 | T *dst_row0 = dst + dst_stride * dst_y; | |
| 712 | 214 | T *dst_row1 = dst_row0 + dst_stride; | |
| 713 | 214 | T *dst_row2 = dst_row1 + dst_stride; | |
| 714 | 214 | T *dst_row3 = dst_row2 + dst_stride; | |
| 715 | |||
| 716 | 214 | process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); | |
| 717 | 214 | } | |
| 718 | |||
| 719 | // Bottom rows | ||
| 720 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 45 times.
|
62 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 721 | 90 | process_edge_row(src + src_stride * (src_height - 1), | |
| 722 | 45 | dst + dst_stride * (dst_height - 2)); | |
| 723 | 135 | memcpy(dst + dst_stride * (dst_height - 1), | |
| 724 | 90 | dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T)); | |
| 725 | 45 | } | |
| 726 | |||
| 727 | 62 | return KLEIDICV_OK; | |
| 728 | 62 | } | |
| 729 | |||
| 730 | 58 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( | |
| 731 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 732 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
| 733 | 58 | size_t dst_width = src_width * 8; | |
| 734 | 58 | size_t dst_height = src_height * 8; | |
| 735 | 58 | src_stride /= sizeof(float); | |
| 736 | 58 | dst_stride /= sizeof(float); | |
| 737 | |||
| 738 | 58 | float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0, | |
| 739 | 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0}; | ||
| 740 | 58 | float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0, | |
| 741 | 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0}; | ||
| 742 | 58 | float32x4_t coeffs_a0 = vld1q_f32(&coeffs_a[0]); | |
| 743 | 58 | float32x4_t coeffs_a1 = vld1q_f32(&coeffs_a[4]); | |
| 744 | 58 | float32x4_t coeffs_b0 = vld1q_f32(&coeffs_b[0]); | |
| 745 | 58 | float32x4_t coeffs_b1 = vld1q_f32(&coeffs_b[4]); | |
| 746 | |||
| 747 | 3418 | auto lerp1d_vector_n = [](float p, float32x4_t a, float q, float32x4_t b) { | |
| 748 | 3360 | return vmlaq_n_f32(vmulq_n_f32(a, p), b, q); | |
| 749 | }; | ||
| 750 | |||
| 751 | 121090 | auto lerp1d_vector_n2 = [](float32x4_t a, float q, float32x4_t b) { | |
| 752 | 121032 | return vmlaq_n_f32(a, b, q); | |
| 753 | }; | ||
| 754 | |||
| 755 | 9330 | auto lerp1d_vector = [](float32x4_t p, float32x4_t a, float32x4_t q, | |
| 756 | float32x4_t b) { | ||
| 757 | 9272 | return vmlaq_f32(vmulq_f32(a, p), b, q); | |
| 758 | }; | ||
| 759 | |||
| 760 | // Handle top or bottom edge | ||
| 761 | 58 | auto process_edge_row = | |
| 762 | 140 | [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, | |
| 763 | &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) { | ||
| 764 | // Left elements | ||
| 765 | 82 | dst_row[3] = dst_row[2] = dst_row[1] = dst_row[0] = src_row[0]; | |
| 766 | 82 | dst_row[dst_stride + 3] = dst_row[dst_stride + 2] = | |
| 767 | 82 | dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0]; | |
| 768 | 82 | dst_row[2 * dst_stride + 3] = dst_row[2 * dst_stride + 2] = | |
| 769 | 82 | dst_row[2 * dst_stride + 1] = dst_row[2 * dst_stride] = src_row[0]; | |
| 770 | 82 | dst_row[3 * dst_stride + 3] = dst_row[3 * dst_stride + 2] = | |
| 771 | 82 | dst_row[3 * dst_stride + 1] = dst_row[3 * dst_stride] = src_row[0]; | |
| 772 | |||
| 773 | // Right elements | ||
| 774 | 82 | float *dst_right = dst_row + dst_width - 4; | |
| 775 | 82 | dst_right[3] = dst_right[2] = dst_right[1] = dst_right[0] = | |
| 776 | 82 | src_row[src_width - 1]; | |
| 777 | 82 | dst_right[dst_stride + 3] = dst_right[dst_stride + 2] = | |
| 778 | 82 | dst_right[dst_stride + 1] = dst_right[dst_stride] = | |
| 779 | 82 | src_row[src_width - 1]; | |
| 780 | 82 | dst_right[2 * dst_stride + 3] = dst_right[2 * dst_stride + 2] = | |
| 781 | 82 | dst_right[2 * dst_stride + 1] = dst_right[2 * dst_stride] = | |
| 782 | 82 | src_row[src_width - 1]; | |
| 783 | 82 | dst_right[3 * dst_stride + 3] = dst_right[3 * dst_stride + 2] = | |
| 784 | 82 | dst_right[3 * dst_stride + 1] = dst_right[3 * dst_stride] = | |
| 785 | 82 | src_row[src_width - 1]; | |
| 786 | |||
| 787 | // Middle elements | ||
| 788 | 82 | float32x4_t a, b = vdupq_n_f32(src_row[0]); | |
| 789 |
2/2✓ Branch 0 taken 82 times.
✓ Branch 1 taken 4636 times.
|
4718 | for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { |
| 790 | 4636 | a = b; | |
| 791 | 4636 | b = vdupq_n_f32(src_row[src_x + 1]); | |
| 792 | 4636 | float *dst_row0 = dst_row + src_x * 8 + 4; | |
| 793 | 4636 | float *dst_row1 = dst_row0 + dst_stride; | |
| 794 | 4636 | float *dst_row2 = dst_row1 + dst_stride; | |
| 795 | 4636 | float *dst_row3 = dst_row2 + dst_stride; | |
| 796 | 4636 | float32x4_t dst = lerp1d_vector(coeffs_a0, a, coeffs_b0, b); | |
| 797 | 4636 | vst1q(dst_row0, dst); | |
| 798 | 4636 | vst1q(dst_row1, dst); | |
| 799 | 4636 | vst1q(dst_row2, dst); | |
| 800 | 4636 | vst1q(dst_row3, dst); | |
| 801 | 4636 | dst = lerp1d_vector(coeffs_a1, a, coeffs_b1, b); | |
| 802 | 4636 | vst1q(dst_row0 + 4, dst); | |
| 803 | 4636 | vst1q(dst_row1 + 4, dst); | |
| 804 | 4636 | vst1q(dst_row2 + 4, dst); | |
| 805 | 4636 | vst1q(dst_row3 + 4, dst); | |
| 806 | 4636 | } | |
| 807 | 82 | }; | |
| 808 | |||
| 809 | 58 | float32x4_t coeffs_p0 = vmulq_n_f32(coeffs_a0, 15.0 / 16); | |
| 810 | 58 | float32x4_t coeffs_q0 = vmulq_n_f32(coeffs_b0, 15.0 / 16); | |
| 811 | 58 | float32x4_t coeffs_r0 = vmulq_n_f32(coeffs_a0, 1.0 / 16); | |
| 812 | 58 | float32x4_t coeffs_s0 = vmulq_n_f32(coeffs_b0, 1.0 / 16); | |
| 813 | 58 | float32x4_t coeffs_p1 = vmulq_n_f32(coeffs_a1, 15.0 / 16); | |
| 814 | 58 | float32x4_t coeffs_q1 = vmulq_n_f32(coeffs_b1, 15.0 / 16); | |
| 815 | 58 | float32x4_t coeffs_r1 = vmulq_n_f32(coeffs_a1, 1.0 / 16); | |
| 816 | 58 | float32x4_t coeffs_s1 = vmulq_n_f32(coeffs_b1, 1.0 / 16); | |
| 817 | |||
| 818 | 40402 | auto lerp2d_vector = [](float32x4_t a, float32x4_t p, float32x4_t b, | |
| 819 | float32x4_t q, float32x4_t c, float32x4_t r, | ||
| 820 | float32x4_t d, float32x4_t s) { | ||
| 821 | 40344 | return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s); | |
| 822 | }; | ||
| 823 | |||
| 824 | 268 | auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n, | |
| 825 | lerp1d_vector_n2, &coeffs_p0, &coeffs_q0, &coeffs_r0, | ||
| 826 | &coeffs_s0, &coeffs_p1, &coeffs_q1, &coeffs_r1, | ||
| 827 | &coeffs_s1](const float *src_row0, const float *src_row1, | ||
| 828 | float *dst_row0, size_t dst_stride) { | ||
| 829 | // Left elements | ||
| 830 | 210 | float32x4_t s0 = vdupq_n_f32(src_row0[0]); | |
| 831 | 210 | float32x4_t s1 = vdupq_n_f32(src_row1[0]); | |
| 832 | 210 | float *dst_row = dst_row0; | |
| 833 |
2/2✓ Branch 0 taken 1680 times.
✓ Branch 1 taken 210 times.
|
1890 | for (size_t i = 0; i < 8; ++i) { |
| 834 | 3360 | vst1q(dst_row, | |
| 835 | 3360 | lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0, | |
| 836 | 1680 | static_cast<float>(i * 2 + 1) / 16.0F, s1)); | |
| 837 | 1680 | dst_row += dst_stride; | |
| 838 | 1680 | } | |
| 839 | |||
| 840 | // Middle elements | ||
| 841 | 210 | dst_row0 += 4; | |
| 842 | 210 | float *dst_row1 = dst_row0 + dst_stride; | |
| 843 | 210 | float *dst_row2 = dst_row1 + dst_stride; | |
| 844 | 210 | float *dst_row3 = dst_row2 + dst_stride; | |
| 845 | 210 | float *dst_row4 = dst_row3 + dst_stride; | |
| 846 | 210 | float *dst_row5 = dst_row4 + dst_stride; | |
| 847 | 210 | float *dst_row6 = dst_row5 + dst_stride; | |
| 848 | 210 | float *dst_row7 = dst_row6 + dst_stride; | |
| 849 | 210 | float32x4_t a, b = s0; | |
| 850 | 210 | float32x4_t c, d = s1; | |
| 851 |
2/2✓ Branch 0 taken 10086 times.
✓ Branch 1 taken 210 times.
|
10296 | for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { |
| 852 | 10086 | KLEIDICV_PREFETCH(dst_row0 + 64); | |
| 853 | 10086 | KLEIDICV_PREFETCH(dst_row1 + 64); | |
| 854 | 10086 | KLEIDICV_PREFETCH(dst_row2 + 64); | |
| 855 | 10086 | KLEIDICV_PREFETCH(dst_row3 + 64); | |
| 856 | 10086 | KLEIDICV_PREFETCH(dst_row4 + 64); | |
| 857 | 10086 | KLEIDICV_PREFETCH(dst_row5 + 64); | |
| 858 | 10086 | KLEIDICV_PREFETCH(dst_row6 + 64); | |
| 859 | 10086 | KLEIDICV_PREFETCH(dst_row7 + 64); | |
| 860 | 10086 | a = b; | |
| 861 | 10086 | b = vdupq_n_f32(src_row0[src_x + 1]); | |
| 862 | 10086 | c = d; | |
| 863 | 10086 | d = vdupq_n_f32(src_row1[src_x + 1]); | |
| 864 | 10086 | float32x4x2_t dst_0; | |
| 865 | 10086 | dst_0.val[0] = | |
| 866 | 10086 | lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); | |
| 867 | 10086 | dst_0.val[1] = | |
| 868 | 10086 | lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d); | |
| 869 | |||
| 870 | 10086 | neon::VecTraits<float>::store(dst_0, dst_row0); | |
| 871 | 10086 | float32x4x2_t dst_7; | |
| 872 | 10086 | dst_7.val[0] = | |
| 873 | 10086 | lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); | |
| 874 | 10086 | dst_7.val[1] = | |
| 875 | 10086 | lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d); | |
| 876 | |||
| 877 | 10086 | neon::VecTraits<float>::store(dst_7, dst_row7); | |
| 878 | 10086 | float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]); | |
| 879 | 10086 | float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]); | |
| 880 | |||
| 881 | 10086 | float32x4x2_t dst; | |
| 882 | 10086 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0); | |
| 883 | 10086 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1); | |
| 884 | |||
| 885 | 10086 | neon::VecTraits<float>::store(dst, dst_row1); | |
| 886 | 10086 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0); | |
| 887 | 10086 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1); | |
| 888 | |||
| 889 | 10086 | neon::VecTraits<float>::store(dst, dst_row2); | |
| 890 | 10086 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0); | |
| 891 | 10086 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1); | |
| 892 | |||
| 893 | 10086 | neon::VecTraits<float>::store(dst, dst_row3); | |
| 894 | 10086 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0); | |
| 895 | 10086 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1); | |
| 896 | |||
| 897 | 10086 | neon::VecTraits<float>::store(dst, dst_row4); | |
| 898 | 10086 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0); | |
| 899 | 10086 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1); | |
| 900 | |||
| 901 | 10086 | neon::VecTraits<float>::store(dst, dst_row5); | |
| 902 | 10086 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0); | |
| 903 | 10086 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1); | |
| 904 | |||
| 905 | 10086 | neon::VecTraits<float>::store(dst, dst_row6); | |
| 906 | 10086 | dst_row0 += 8; | |
| 907 | 10086 | dst_row1 += 8; | |
| 908 | 10086 | dst_row2 += 8; | |
| 909 | 10086 | dst_row3 += 8; | |
| 910 | 10086 | dst_row4 += 8; | |
| 911 | 10086 | dst_row5 += 8; | |
| 912 | 10086 | dst_row6 += 8; | |
| 913 | 10086 | dst_row7 += 8; | |
| 914 | 10086 | } | |
| 915 | |||
| 916 | // Right elements | ||
| 917 | 210 | s0 = b; | |
| 918 | 210 | s1 = d; | |
| 919 | 210 | dst_row = dst_row0; | |
| 920 |
2/2✓ Branch 0 taken 210 times.
✓ Branch 1 taken 1680 times.
|
1890 | for (size_t i = 0; i < 8; ++i) { |
| 921 | 3360 | vst1q(dst_row, | |
| 922 | 3360 | lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0, | |
| 923 | 1680 | static_cast<float>(i * 2 + 1) / 16.0F, s1)); | |
| 924 | 1680 | dst_row += dst_stride; | |
| 925 | 1680 | } | |
| 926 | 210 | }; | |
| 927 | |||
| 928 | // Top rows | ||
| 929 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
|
58 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 930 | 41 | process_edge_row(src, dst, dst_stride); | |
| 931 | 41 | } | |
| 932 | |||
| 933 | // Middle rows | ||
| 934 |
2/2✓ Branch 0 taken 210 times.
✓ Branch 1 taken 58 times.
|
268 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 935 | 210 | size_t dst_y = src_y * 8 + 4; | |
| 936 | 210 | const float *src_row0 = src + src_stride * src_y; | |
| 937 | 210 | const float *src_row1 = src_row0 + src_stride; | |
| 938 | 210 | process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride); | |
| 939 | 210 | } | |
| 940 | |||
| 941 | // Bottom rows | ||
| 942 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 41 times.
|
58 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 943 | 82 | process_edge_row(src + src_stride * (src_height - 1), | |
| 944 | 41 | dst + dst_stride * (dst_height - 4), dst_stride); | |
| 945 | 41 | } | |
| 946 | |||
| 947 | 58 | return KLEIDICV_OK; | |
| 948 | 58 | } | |
| 949 | |||
| 950 | 196 | kleidicv_error_t kleidicv_resize_linear_stripe_f32( | |
| 951 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 952 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride, | ||
| 953 | size_t dst_width, size_t dst_height) { | ||
| 954 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 195 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 195 times.
|
196 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 955 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 194 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 194 times.
|
195 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
| 956 | |||
| 957 |
2/4✓ Branch 0 taken 194 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 194 times.
|
194 | if (src_width == 0 || src_height == 0) { |
| 958 | ✗ | return KLEIDICV_OK; | |
| 959 | } | ||
| 960 |
3/4✓ Branch 0 taken 74 times.
✓ Branch 1 taken 120 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 74 times.
|
194 | if (src_width * 2 == dst_width && src_height * 2 == dst_height) { |
| 961 | 148 | return resize_2x2_f32(src, src_stride, src_width, src_height, y_begin, | |
| 962 | 74 | y_end, dst, dst_stride); | |
| 963 | } | ||
| 964 |
3/4✓ Branch 0 taken 62 times.
✓ Branch 1 taken 58 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 62 times.
|
120 | if (src_width * 4 == dst_width && src_height * 4 == dst_height) { |
| 965 | 124 | return resize_4x4_f32(src, src_stride, src_width, src_height, y_begin, | |
| 966 | 62 | y_end, dst, dst_stride); | |
| 967 | } | ||
| 968 |
2/4✓ Branch 0 taken 58 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 58 times.
|
58 | if (src_width * 8 == dst_width && src_height * 8 == dst_height) { |
| 969 | 116 | return resize_8x8_f32(src, src_stride, src_width, src_height, y_begin, | |
| 970 | 58 | y_end, dst, dst_stride); | |
| 971 | } | ||
| 972 | // resize_linear_f32_is_implemented checked the kernel size already. | ||
| 973 | // GCOVR_EXCL_START | ||
| 974 | assert(!"resize ratio not implemented"); | ||
| 975 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 976 | // GCOVR_EXCL_STOP | ||
| 977 | 196 | } | |
| 978 | } // namespace kleidicv::neon | ||
| 979 |