| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cassert> | ||
| 6 | |||
| 7 | #include "kleidicv/kleidicv.h" | ||
| 8 | #include "kleidicv/neon.h" | ||
| 9 | #include "kleidicv/operations.h" | ||
| 10 | #include "kleidicv/resize/resize_linear.h" | ||
| 11 | |||
| 12 | namespace kleidicv::neon { | ||
| 13 | |||
| 14 | template <uint8_t P, uint8_t Q, uint8_t Bias, uint8_t Shift> | ||
| 15 | 8848 | uint8x8_t lerp2d_vector_p_q_q_1(uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 16 | uint8x8_t d) { | ||
| 17 | // b + c | ||
| 18 | 8848 | uint16x8_t b_c = vaddl_u8(b, c); | |
| 19 | |||
| 20 | // a * p | ||
| 21 | 8848 | uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); | |
| 22 | |||
| 23 | // a * p + (b + c) * q | ||
| 24 | 8848 | uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); | |
| 25 | |||
| 26 | // d + bias | ||
| 27 | 8848 | uint16x8_t d_bias = vaddl_u8(d, vdup_n_u8(Bias)); | |
| 28 | |||
| 29 | // a * p + (b + c) * q + d + bias | ||
| 30 | 8848 | uint16x8_t ap_bcq_d_bias = vaddq_u16(ap_bcq, d_bias); | |
| 31 | |||
| 32 | // (a * p + (b + c) * q + d + bias) >> shift | ||
| 33 | 8848 | uint8x8_t result = vshrn_n_u16(ap_bcq_d_bias, Shift); | |
| 34 | 17696 | return result; | |
| 35 | 8848 | } | |
| 36 | |||
| 37 | template <uint8_t P, uint8_t Q, uint8_t R, uint8_t Bias, uint8_t Shift> | ||
| 38 | 4424 | uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 39 | uint8x8_t d) { | ||
| 40 | // b + c | ||
| 41 | 4424 | uint16x8_t b_c = vaddl_u8(b, c); | |
| 42 | |||
| 43 | // a * p | ||
| 44 | 4424 | uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); | |
| 45 | |||
| 46 | // d * r | ||
| 47 | 4424 | uint16x8_t dr = vmull_u8(d, vdup_n_u8(R)); | |
| 48 | |||
| 49 | // a * p + (b + c) * q | ||
| 50 | 4424 | uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); | |
| 51 | |||
| 52 | // d * r + bias | ||
| 53 | 4424 | uint16x8_t dr_bias = vaddq_u16(dr, vdupq_n_u16(Bias)); | |
| 54 | |||
| 55 | // a * p + (b + c) * q + d * r + bias | ||
| 56 | 4424 | uint16x8_t ap_bcq_dr_bias = vaddq_u16(ap_bcq, dr_bias); | |
| 57 | |||
| 58 | // (a * p + (b + c) * q + d * r + bias) >> shift | ||
| 59 | 4424 | uint8x8_t result = vshrn_n_u16(ap_bcq_dr_bias, Shift); | |
| 60 | 8848 | return result; | |
| 61 | 4424 | } | |
| 62 | |||
| 63 | 54 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8( | |
| 64 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 65 | size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { | ||
| 66 | 54 | size_t dst_width = src_width * 2; | |
| 67 | |||
| 68 | 926 | auto lerp1d_scalar = [](uint8_t near, uint8_t far) { | |
| 69 | 872 | return (near * 3 + far + 2) >> 2; | |
| 70 | }; | ||
| 71 | |||
| 72 | 1150 | auto lerp1d_vector = [](uint8x8_t near, uint8x8_t far) { | |
| 73 | 1096 | uint8x8_t three = vdup_n_u8(3); | |
| 74 | 1096 | uint8x8_t two = vdup_n_u8(2); | |
| 75 | |||
| 76 | // near * 3 | ||
| 77 | 1096 | uint16x8_t near3 = vmull_u8(near, three); | |
| 78 | |||
| 79 | // far + 2 | ||
| 80 | 1096 | uint16x8_t far_2 = vaddl_u8(far, two); | |
| 81 | |||
| 82 | // near * 3 + far + 2 | ||
| 83 | 1096 | uint16x8_t near3_far_2 = vaddq_u16(near3, far_2); | |
| 84 | |||
| 85 | // (near * 3 + far + 2) / 4 | ||
| 86 | 1096 | uint8x8_t near3_far_2_div4 = vshrn_n_u16(near3_far_2, 2); | |
| 87 | |||
| 88 | 2192 | return near3_far_2_div4; | |
| 89 | 1096 | }; | |
| 90 | |||
| 91 | 1382 | auto lerp2d_scalar = [](uint8_t near, uint8_t mid_a, uint8_t mid_b, | |
| 92 | uint8_t far) { | ||
| 93 | 1328 | return (near * 9 + (mid_a + mid_b) * 3 + far + 8) >> 4; | |
| 94 | }; | ||
| 95 | |||
| 96 | 4478 | auto lerp2d_vector = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) { | |
| 97 | 4424 | return lerp2d_vector_p_q_q_1<9, 3, 8, 4>(a, b, c, d); | |
| 98 | }; | ||
| 99 | |||
| 100 | // Handle top or bottom edge | ||
| 101 | 160 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 102 | const uint8_t *src_row, uint8_t *dst_row) { | ||
| 103 | // Left element | ||
| 104 | 106 | dst_row[0] = src_row[0]; | |
| 105 | |||
| 106 | // Right element | ||
| 107 | 106 | dst_row[dst_width - 1] = src_row[src_width - 1]; | |
| 108 | |||
| 109 | // Middle elements | ||
| 110 | 106 | size_t src_x = 0; | |
| 111 |
2/2✓ Branch 0 taken 548 times.
✓ Branch 1 taken 106 times.
|
654 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 112 | 548 | size_t dst_x = src_x * 2 + 1; | |
| 113 | 548 | uint8x8_t src_left = vld1_u8(src_row + src_x); | |
| 114 | 548 | uint8x8_t src_right = vld1_u8(src_row + src_x + 1); | |
| 115 | |||
| 116 | 548 | uint8x8_t dst_left = lerp1d_vector(src_left, src_right); | |
| 117 | 548 | uint8x8_t dst_right = lerp1d_vector(src_right, src_left); | |
| 118 | |||
| 119 | 548 | vst2_u8(dst_row + dst_x, (uint8x8x2_t{dst_left, dst_right})); | |
| 120 | 548 | } | |
| 121 |
2/2✓ Branch 0 taken 140 times.
✓ Branch 1 taken 106 times.
|
246 | for (; src_x + 1 < src_width; ++src_x) { |
| 122 | 140 | size_t dst_x = src_x * 2 + 1; | |
| 123 | 140 | const uint8_t src_left = src_row[src_x], src_right = src_row[src_x + 1]; | |
| 124 | 140 | dst_row[dst_x] = lerp1d_scalar(src_left, src_right); | |
| 125 | 140 | dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left); | |
| 126 | 140 | } | |
| 127 | 106 | }; | |
| 128 | |||
| 129 | 202 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
| 130 | lerp2d_vector](const uint8_t *src_row0, | ||
| 131 | const uint8_t *src_row1, uint8_t *dst_row0, | ||
| 132 | uint8_t *dst_row1) { | ||
| 133 | // Left element | ||
| 134 | 148 | dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); | |
| 135 | 148 | dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); | |
| 136 | |||
| 137 | // Right element | ||
| 138 | 148 | dst_row0[dst_width - 1] = | |
| 139 | 148 | lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); | |
| 140 | 148 | dst_row1[dst_width - 1] = | |
| 141 | 148 | lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); | |
| 142 | |||
| 143 | // Middle elements | ||
| 144 | 148 | size_t src_x = 0; | |
| 145 |
2/2✓ Branch 0 taken 1106 times.
✓ Branch 1 taken 148 times.
|
1254 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 146 | 1106 | size_t dst_x = src_x * 2 + 1; | |
| 147 | |||
| 148 | 1106 | uint8x8_t src_tl = vld1_u8(src_row0 + src_x); | |
| 149 | 1106 | uint8x8_t src_tr = vld1_u8(src_row0 + src_x + 1); | |
| 150 | 1106 | uint8x8_t src_bl = vld1_u8(src_row1 + src_x); | |
| 151 | 1106 | uint8x8_t src_br = vld1_u8(src_row1 + src_x + 1); | |
| 152 | |||
| 153 | 1106 | uint8x8_t dst_tl = lerp2d_vector(src_tl, src_tr, src_bl, src_br); | |
| 154 | 1106 | uint8x8_t dst_tr = lerp2d_vector(src_tr, src_tl, src_br, src_bl); | |
| 155 | 1106 | uint8x8_t dst_bl = lerp2d_vector(src_bl, src_tl, src_br, src_tr); | |
| 156 | 1106 | uint8x8_t dst_br = lerp2d_vector(src_br, src_tr, src_bl, src_tl); | |
| 157 | |||
| 158 | 1106 | vst2_u8(dst_row0 + dst_x, (uint8x8x2_t{dst_tl, dst_tr})); | |
| 159 | 1106 | vst2_u8(dst_row1 + dst_x, (uint8x8x2_t{dst_bl, dst_br})); | |
| 160 | 1106 | } | |
| 161 |
2/2✓ Branch 0 taken 332 times.
✓ Branch 1 taken 148 times.
|
480 | for (; src_x + 1 < src_width; ++src_x) { |
| 162 | 332 | size_t dst_x = src_x * 2 + 1; | |
| 163 | 664 | const uint8_t src_tl = src_row0[src_x], src_tr = src_row0[src_x + 1], | |
| 164 | 664 | src_bl = src_row1[src_x], src_br = src_row1[src_x + 1]; | |
| 165 | 332 | dst_row0[dst_x] = lerp2d_scalar(src_tl, src_tr, src_bl, src_br); | |
| 166 | 332 | dst_row0[dst_x + 1] = lerp2d_scalar(src_tr, src_tl, src_br, src_bl); | |
| 167 | 332 | dst_row1[dst_x] = lerp2d_scalar(src_bl, src_tl, src_br, src_tr); | |
| 168 | 332 | dst_row1[dst_x + 1] = lerp2d_scalar(src_br, src_tr, src_bl, src_tl); | |
| 169 | 332 | } | |
| 170 | 148 | }; | |
| 171 | |||
| 172 | // Top row | ||
| 173 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 53 times.
|
54 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 174 | 53 | process_edge_row(src, dst); | |
| 175 | 53 | } | |
| 176 | |||
| 177 | // Middle rows | ||
| 178 |
2/2✓ Branch 0 taken 148 times.
✓ Branch 1 taken 54 times.
|
202 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 179 | 148 | size_t dst_y = src_y * 2 + 1; | |
| 180 | 148 | const uint8_t *src_row0 = src + src_stride * src_y; | |
| 181 | 148 | const uint8_t *src_row1 = src_row0 + src_stride; | |
| 182 | 148 | uint8_t *dst_row0 = dst + dst_stride * dst_y; | |
| 183 | 148 | uint8_t *dst_row1 = dst_row0 + dst_stride; | |
| 184 | |||
| 185 | 148 | process_row(src_row0, src_row1, dst_row0, dst_row1); | |
| 186 | 148 | } | |
| 187 | |||
| 188 | // Bottom row | ||
| 189 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 53 times.
|
54 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 190 | 106 | process_edge_row(src + src_stride * (src_height - 1), | |
| 191 | 53 | dst + dst_stride * (src_height * 2 - 1)); | |
| 192 | 53 | } | |
| 193 | |||
| 194 | 54 | return KLEIDICV_OK; | |
| 195 | 54 | } | |
| 196 | |||
| 197 | 34 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8( | |
| 198 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 199 | size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { | ||
| 200 | 34 | size_t dst_width = src_width * 4, dst_height = src_height * 4; | |
| 201 | |||
| 202 | 1394 | auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, | |
| 203 | uint8_t b) { | ||
| 204 | 1360 | return (coeff_a * a + coeff_b * b + 4) >> 3; | |
| 205 | }; | ||
| 206 | 2226 | auto lerp1d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, | |
| 207 | uint8_t coeff_b_scalar, uint8x8_t b) { | ||
| 208 | 2192 | uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); | |
| 209 | 2192 | uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); | |
| 210 | 2192 | uint16x8_t four = vdupq_n_u16(4); | |
| 211 | |||
| 212 | // a * coeff_a | ||
| 213 | 2192 | uint16x8_t a1 = vmull_u8(a, coeff_a); | |
| 214 | |||
| 215 | // b * coeff_b | ||
| 216 | 2192 | uint16x8_t b1 = vmull_u8(b, coeff_b); | |
| 217 | |||
| 218 | // a * coeff_a + b * coeff_b | ||
| 219 | 2192 | uint16x8_t a1_b1 = vaddq_u16(a1, b1); | |
| 220 | |||
| 221 | // a * coeff_a + b * coeff_b + 4 | ||
| 222 | 2192 | uint16x8_t a1_b1_4 = vaddq_u16(a1_b1, four); | |
| 223 | |||
| 224 | // (a * coeff_a + b * coeff_b + 4) / 8 | ||
| 225 | 2192 | uint8x8_t result = vshrn_n_u16(a1_b1_4, 3); | |
| 226 | |||
| 227 | 4384 | return result; | |
| 228 | 2192 | }; | |
| 229 | 4514 | auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, | |
| 230 | uint8_t b, uint8_t coeff_c, uint8_t c, | ||
| 231 | uint8_t coeff_d, uint8_t d) { | ||
| 232 | 4480 | return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6; | |
| 233 | }; | ||
| 234 | 8882 | auto lerp2d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, | |
| 235 | uint8_t coeff_b_scalar, uint8x8_t b, | ||
| 236 | uint8_t coeff_c_scalar, uint8x8_t c, | ||
| 237 | uint8_t coeff_d_scalar, uint8x8_t d) { | ||
| 238 | 8848 | uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); | |
| 239 | 8848 | uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); | |
| 240 | 8848 | uint8x8_t coeff_c = vdup_n_u8(coeff_c_scalar); | |
| 241 | 8848 | uint8x8_t coeff_d = vdup_n_u8(coeff_d_scalar); | |
| 242 | 8848 | uint16x8_t thirtytwo = vdupq_n_u16(32); | |
| 243 | |||
| 244 | // a * coeff_a | ||
| 245 | 8848 | uint16x8_t a1 = vmull_u8(a, coeff_a); | |
| 246 | |||
| 247 | // b * coeff_b | ||
| 248 | 8848 | uint16x8_t b1 = vmull_u8(b, coeff_b); | |
| 249 | |||
| 250 | // c * coeff_c | ||
| 251 | 8848 | uint16x8_t c1 = vmull_u8(c, coeff_c); | |
| 252 | |||
| 253 | // d * coeff_d | ||
| 254 | 8848 | uint16x8_t d1 = vmull_u8(d, coeff_d); | |
| 255 | |||
| 256 | // a * coeff_a + b * coeff_b | ||
| 257 | 8848 | uint16x8_t a1_b1 = vaddq_u16(a1, b1); | |
| 258 | |||
| 259 | // c * coeff_c + d * coeff_d | ||
| 260 | 8848 | uint16x8_t c1_d1 = vaddq_u16(c1, d1); | |
| 261 | |||
| 262 | // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d | ||
| 263 | 8848 | uint16x8_t a1_b1_c1_d1 = vaddq_u16(a1_b1, c1_d1); | |
| 264 | |||
| 265 | // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32 | ||
| 266 | 8848 | uint16x8_t a1_b1_c1_d1_32 = vaddq_u16(a1_b1_c1_d1, thirtytwo); | |
| 267 | |||
| 268 | // (a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32) / 64 | ||
| 269 | 8848 | uint8x8_t result = vshrn_n_u16(a1_b1_c1_d1_32, 6); | |
| 270 | 17696 | return result; | |
| 271 | 8848 | }; | |
| 272 | // Handle top or bottom edge | ||
| 273 | 100 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 274 | const uint8_t *src_row, uint8_t *dst_row) { | ||
| 275 | // Left elements | ||
| 276 | 66 | dst_row[1] = dst_row[0] = src_row[0]; | |
| 277 | |||
| 278 | // Right elements | ||
| 279 | 66 | dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; | |
| 280 | |||
| 281 | // Middle elements | ||
| 282 | 66 | size_t src_x = 0; | |
| 283 |
2/2✓ Branch 0 taken 548 times.
✓ Branch 1 taken 66 times.
|
614 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 284 | 548 | size_t dst_x = src_x * 4 + 2; | |
| 285 | 548 | uint8x8_t a = vld1_u8(src_row + src_x); | |
| 286 | 548 | uint8x8_t b = vld1_u8(src_row + src_x + 1); | |
| 287 | 548 | uint8x8x4_t interpolated = { | |
| 288 | 1644 | lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b), | |
| 289 | 1096 | lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)}; | |
| 290 | |||
| 291 | 548 | vst4_u8(dst_row + dst_x, interpolated); | |
| 292 | 548 | } | |
| 293 |
2/2✓ Branch 0 taken 84 times.
✓ Branch 1 taken 66 times.
|
150 | for (; src_x + 1 < src_width; ++src_x) { |
| 294 | 84 | size_t dst_x = src_x * 4 + 2; | |
| 295 | 84 | const uint8_t a = src_row[src_x], b = src_row[src_x + 1]; | |
| 296 | 84 | dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b); | |
| 297 | 84 | dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b); | |
| 298 | 84 | dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b); | |
| 299 | 84 | dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b); | |
| 300 | 84 | } | |
| 301 | 66 | }; | |
| 302 | |||
| 303 | 162 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
| 304 | lerp2d_vector](const uint8_t *src_row0, | ||
| 305 | const uint8_t *src_row1, uint8_t *dst_row0, | ||
| 306 | uint8_t *dst_row1, uint8_t *dst_row2, | ||
| 307 | uint8_t *dst_row3) { | ||
| 308 | 4552 | auto lerp2d_vector_49_7_7_1 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 309 | uint8x8_t d) { | ||
| 310 | 4424 | return lerp2d_vector_p_q_q_1<49, 7, 32, 6>(a, b, c, d); | |
| 311 | }; | ||
| 312 | 4552 | auto lerp2d_vector_25_15_15_9 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, | |
| 313 | uint8x8_t d) { | ||
| 314 | 4424 | return lerp2d_vector_p_q_q_r<25, 15, 9, 32, 6>(a, b, c, d); | |
| 315 | }; | ||
| 316 | |||
| 317 | // Left elements | ||
| 318 | 128 | const uint8_t s0l = src_row0[0], s1l = src_row1[0]; | |
| 319 | 128 | dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); | |
| 320 | 128 | dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); | |
| 321 | 128 | dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); | |
| 322 | 128 | dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); | |
| 323 | |||
| 324 | // Right elements | ||
| 325 | 128 | const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; | |
| 326 | 128 | const size_t dr0 = dst_width - 2; | |
| 327 | 128 | const size_t dr1 = dst_width - 1; | |
| 328 | 128 | dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); | |
| 329 | 128 | dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); | |
| 330 | 128 | dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); | |
| 331 | 128 | dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); | |
| 332 | |||
| 333 | // Middle elements | ||
| 334 | 128 | size_t src_x = 0; | |
| 335 |
2/2✓ Branch 0 taken 1106 times.
✓ Branch 1 taken 128 times.
|
1234 | for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { |
| 336 | 1106 | size_t dst_x = src_x * 4 + 2; | |
| 337 | |||
| 338 | 1106 | uint8x8_t a = vld1_u8(src_row0 + src_x); | |
| 339 | 1106 | uint8x8_t b = vld1_u8(src_row0 + src_x + 1); | |
| 340 | 1106 | uint8x8_t c = vld1_u8(src_row1 + src_x); | |
| 341 | 1106 | uint8x8_t d = vld1_u8(src_row1 + src_x + 1); | |
| 342 | |||
| 343 | 1106 | vst4_u8(dst_row0 + dst_x, (uint8x8x4_t{ | |
| 344 | lerp2d_vector_49_7_7_1(a, b, c, d), | ||
| 345 | lerp2d_vector(35, a, 21, b, 5, c, 3, d), | ||
| 346 | lerp2d_vector(21, a, 35, b, 3, c, 5, d), | ||
| 347 | lerp2d_vector_49_7_7_1(b, a, d, c), | ||
| 348 | })); | ||
| 349 | 1106 | vst4_u8(dst_row1 + dst_x, (uint8x8x4_t{ | |
| 350 | lerp2d_vector(35, a, 5, b, 21, c, 3, d), | ||
| 351 | lerp2d_vector_25_15_15_9(a, b, c, d), | ||
| 352 | lerp2d_vector_25_15_15_9(b, a, d, c), | ||
| 353 | lerp2d_vector(5, a, 35, b, 3, c, 21, d), | ||
| 354 | })); | ||
| 355 | 1106 | vst4_u8(dst_row2 + dst_x, (uint8x8x4_t{ | |
| 356 | lerp2d_vector(21, a, 3, b, 35, c, 5, d), | ||
| 357 | lerp2d_vector_25_15_15_9(c, a, d, b), | ||
| 358 | lerp2d_vector_25_15_15_9(d, b, c, a), | ||
| 359 | lerp2d_vector(3, a, 21, b, 5, c, 35, d), | ||
| 360 | })); | ||
| 361 | 1106 | vst4_u8(dst_row3 + dst_x, (uint8x8x4_t{ | |
| 362 | lerp2d_vector_49_7_7_1(c, a, d, b), | ||
| 363 | lerp2d_vector(5, a, 3, b, 35, c, 21, d), | ||
| 364 | lerp2d_vector(3, a, 5, b, 21, c, 35, d), | ||
| 365 | lerp2d_vector_49_7_7_1(d, b, c, a), | ||
| 366 | })); | ||
| 367 | 1106 | } | |
| 368 |
2/2✓ Branch 0 taken 280 times.
✓ Branch 1 taken 128 times.
|
408 | for (; src_x + 1 < src_width; ++src_x) { |
| 369 | 280 | size_t dst_x = src_x * 4 + 2; | |
| 370 | 560 | const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1], | |
| 371 | 560 | c = src_row1[src_x], d = src_row1[src_x + 1]; | |
| 372 | |||
| 373 | 280 | dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d); | |
| 374 | 280 | dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d); | |
| 375 | 280 | dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d); | |
| 376 | 280 | dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d); | |
| 377 | 280 | dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d); | |
| 378 | 280 | dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d); | |
| 379 | 280 | dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d); | |
| 380 | 280 | dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d); | |
| 381 | 280 | dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d); | |
| 382 | 280 | dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d); | |
| 383 | 280 | dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d); | |
| 384 | 280 | dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d); | |
| 385 | 280 | dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d); | |
| 386 | 280 | dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d); | |
| 387 | 280 | dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d); | |
| 388 | 280 | dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d); | |
| 389 | 280 | } | |
| 390 | 128 | }; | |
| 391 | |||
| 392 | // Top rows | ||
| 393 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 394 | 33 | process_edge_row(src, dst); | |
| 395 | 33 | memcpy(dst + dst_stride, dst, dst_stride); | |
| 396 | 33 | } | |
| 397 | |||
| 398 | // Middle rows | ||
| 399 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
|
162 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 400 | 128 | size_t dst_y = src_y * 4 + 2; | |
| 401 | 128 | const uint8_t *src_row0 = src + src_stride * src_y; | |
| 402 | 128 | const uint8_t *src_row1 = src_row0 + src_stride; | |
| 403 | 128 | uint8_t *dst_row0 = dst + dst_stride * dst_y; | |
| 404 | 128 | uint8_t *dst_row1 = dst_row0 + dst_stride; | |
| 405 | 128 | uint8_t *dst_row2 = dst_row1 + dst_stride; | |
| 406 | 128 | uint8_t *dst_row3 = dst_row2 + dst_stride; | |
| 407 | |||
| 408 | 128 | process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); | |
| 409 | 128 | } | |
| 410 | |||
| 411 | // Bottom rows | ||
| 412 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 413 | 66 | process_edge_row(src + src_stride * (src_height - 1), | |
| 414 | 33 | dst + dst_stride * (dst_height - 2)); | |
| 415 | 99 | memcpy(dst + dst_stride * (dst_height - 1), | |
| 416 | 66 | dst + dst_stride * (dst_height - 2), dst_stride); | |
| 417 | 33 | } | |
| 418 | |||
| 419 | 34 | return KLEIDICV_OK; | |
| 420 | 34 | } | |
| 421 | |||
| 422 | KLEIDICV_TARGET_FN_ATTRS | ||
| 423 | 95 | kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, | |
| 424 | size_t src_width, size_t src_height, | ||
| 425 | size_t y_begin, size_t y_end, | ||
| 426 | uint8_t *dst, size_t dst_stride, | ||
| 427 | size_t dst_width, size_t dst_height) { | ||
| 428 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 94 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 94 times.
|
95 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 429 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 93 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 93 times.
|
94 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
| 430 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 92 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 91 times.
|
93 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
| 431 | |||
| 432 |
4/4✓ Branch 0 taken 89 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 88 times.
|
91 | if (src_width == 0 || src_height == 0) { |
| 433 | 3 | return KLEIDICV_OK; | |
| 434 | } | ||
| 435 |
3/4✓ Branch 0 taken 54 times.
✓ Branch 1 taken 34 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
|
88 | if (src_width * 2 == dst_width && src_height * 2 == dst_height) { |
| 436 | 108 | return resize_2x2_u8(src, src_stride, src_width, src_height, y_begin, y_end, | |
| 437 | 54 | dst, dst_stride); | |
| 438 | } | ||
| 439 |
2/4✓ Branch 0 taken 34 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 34 times.
|
34 | if (src_width * 4 == dst_width && src_height * 4 == dst_height) { |
| 440 | 68 | return resize_4x4_u8(src, src_stride, src_width, src_height, y_begin, y_end, | |
| 441 | 34 | dst, dst_stride); | |
| 442 | } | ||
| 443 | // resize_linear_u8_is_implemented checked the resize ratio already. | ||
| 444 | // GCOVR_EXCL_START | ||
| 445 | assert(!"resize ratio not implemented"); | ||
| 446 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 447 | // GCOVR_EXCL_STOP | ||
| 448 | 95 | } | |
| 449 | |||
| 450 | 50 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32( | |
| 451 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 452 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
| 453 | 50 | size_t dst_width = src_width * 2; | |
| 454 | 50 | src_stride /= sizeof(float); | |
| 455 | 50 | dst_stride /= sizeof(float); | |
| 456 | |||
| 457 | 874 | auto lerp1d_scalar = [](float near, float far) { | |
| 458 | 824 | return near * 0.75F + far * 0.25F; | |
| 459 | }; | ||
| 460 | |||
| 461 | 2250 | auto lerp1d_vector = [](float32x4_t near, float32x4_t far) { | |
| 462 | 2200 | return vmlaq_n_f32(vmulq_n_f32(near, 0.75F), far, 0.25F); | |
| 463 | }; | ||
| 464 | |||
| 465 | 1282 | auto lerp2d_scalar = [](float near, float mid_a, float mid_b, float far) { | |
| 466 | 1232 | return near * 0.5625F + mid_a * 0.1875F + mid_b * 0.1875F + far * 0.0625F; | |
| 467 | }; | ||
| 468 | |||
| 469 | 8922 | auto lerp2d_vector = [](float32x4_t a, float32x4_t b, float32x4_t c, | |
| 470 | float32x4_t d) { | ||
| 471 | 8872 | return vmlaq_n_f32( | |
| 472 | 8872 | vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, 0.5625F), b, 0.1875F), c, | |
| 473 | 0.1875F), | ||
| 474 | 8872 | d, 0.0625F); | |
| 475 | }; | ||
| 476 | |||
| 477 | // Handle top or bottom edge | ||
| 478 | 148 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 479 | const float *src_row, float *dst_row) { | ||
| 480 | // Left element | ||
| 481 | 98 | dst_row[0] = src_row[0]; | |
| 482 | |||
| 483 | // Right element | ||
| 484 | 98 | dst_row[dst_width - 1] = src_row[src_width - 1]; | |
| 485 | |||
| 486 | // Middle elements | ||
| 487 | 98 | size_t src_x = 0; | |
| 488 |
2/2✓ Branch 0 taken 1100 times.
✓ Branch 1 taken 98 times.
|
1198 | for (; src_x + 4 < src_width; src_x += 4) { |
| 489 | 1100 | size_t dst_x = src_x * 2 + 1; | |
| 490 | 1100 | float32x4_t src_left = vld1q_f32(src_row + src_x); | |
| 491 | 1100 | float32x4_t src_right = vld1q_f32(src_row + src_x + 1); | |
| 492 | |||
| 493 | 1100 | float32x4_t dst_left = lerp1d_vector(src_left, src_right); | |
| 494 | 1100 | float32x4_t dst_right = lerp1d_vector(src_right, src_left); | |
| 495 | |||
| 496 | 1100 | vst2q_f32(dst_row + dst_x, (float32x4x2_t{dst_left, dst_right})); | |
| 497 | 1100 | } | |
| 498 |
2/2✓ Branch 0 taken 116 times.
✓ Branch 1 taken 98 times.
|
214 | for (; src_x + 1 < src_width; ++src_x) { |
| 499 | 116 | size_t dst_x = src_x * 2 + 1; | |
| 500 | 116 | const float src_left = src_row[src_x], src_right = src_row[src_x + 1]; | |
| 501 | 116 | dst_row[dst_x] = lerp1d_scalar(src_left, src_right); | |
| 502 | 116 | dst_row[dst_x + 1] = lerp1d_scalar(src_right, src_left); | |
| 503 | 116 | } | |
| 504 | 98 | }; | |
| 505 | |||
| 506 | 198 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, | |
| 507 | lerp2d_vector](const float *src_row0, | ||
| 508 | const float *src_row1, float *dst_row0, | ||
| 509 | float *dst_row1) { | ||
| 510 | // Left element | ||
| 511 | 148 | dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); | |
| 512 | 148 | dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); | |
| 513 | |||
| 514 | // Right element | ||
| 515 | 148 | dst_row0[dst_width - 1] = | |
| 516 | 148 | lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); | |
| 517 | 148 | dst_row1[dst_width - 1] = | |
| 518 | 148 | lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); | |
| 519 | |||
| 520 | // Middle elements | ||
| 521 | 148 | size_t src_x = 0; | |
| 522 |
2/2✓ Branch 0 taken 2218 times.
✓ Branch 1 taken 148 times.
|
2366 | for (; src_x + 4 < src_width; src_x += 4) { |
| 523 | 2218 | size_t dst_x = src_x * 2 + 1; | |
| 524 | |||
| 525 | 2218 | float32x4_t a = vld1q_f32(src_row0 + src_x); | |
| 526 | 2218 | float32x4_t b = vld1q_f32(src_row0 + src_x + 1); | |
| 527 | 2218 | float32x4_t c = vld1q_f32(src_row1 + src_x); | |
| 528 | 2218 | float32x4_t d = vld1q_f32(src_row1 + src_x + 1); | |
| 529 | |||
| 530 | 2218 | vst2q_f32(dst_row0 + dst_x, (float32x4x2_t{lerp2d_vector(a, b, c, d), | |
| 531 | lerp2d_vector(b, a, d, c)})); | ||
| 532 | 2218 | vst2q_f32(dst_row1 + dst_x, (float32x4x2_t{lerp2d_vector(c, a, d, b), | |
| 533 | lerp2d_vector(d, b, c, a)})); | ||
| 534 | 2218 | } | |
| 535 |
2/2✓ Branch 0 taken 308 times.
✓ Branch 1 taken 148 times.
|
456 | for (; src_x + 1 < src_width; ++src_x) { |
| 536 | 308 | size_t dst_x = src_x * 2 + 1; | |
| 537 | 616 | const float a = src_row0[src_x], b = src_row0[src_x + 1], | |
| 538 | 616 | c = src_row1[src_x], d = src_row1[src_x + 1]; | |
| 539 | 308 | dst_row0[dst_x] = lerp2d_scalar(a, b, c, d); | |
| 540 | 308 | dst_row0[dst_x + 1] = lerp2d_scalar(b, a, d, c); | |
| 541 | 308 | dst_row1[dst_x] = lerp2d_scalar(c, a, d, b); | |
| 542 | 308 | dst_row1[dst_x + 1] = lerp2d_scalar(d, b, c, a); | |
| 543 | 308 | } | |
| 544 | 148 | }; | |
| 545 | |||
| 546 | // Top row | ||
| 547 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
|
50 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 548 | 49 | process_edge_row(src, dst); | |
| 549 | 49 | } | |
| 550 | |||
| 551 | // Middle rows | ||
| 552 |
2/2✓ Branch 0 taken 148 times.
✓ Branch 1 taken 50 times.
|
198 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 553 | 148 | size_t dst_y = src_y * 2 + 1; | |
| 554 | 148 | const float *src_row0 = src + src_stride * src_y; | |
| 555 | 148 | const float *src_row1 = src_row0 + src_stride; | |
| 556 | 148 | float *dst_row0 = dst + dst_stride * dst_y; | |
| 557 | 148 | float *dst_row1 = dst_row0 + dst_stride; | |
| 558 | |||
| 559 | 148 | process_row(src_row0, src_row1, dst_row0, dst_row1); | |
| 560 | 148 | } | |
| 561 | |||
| 562 | // Bottom row | ||
| 563 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
|
50 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 564 | 98 | process_edge_row(src + src_stride * (src_height - 1), | |
| 565 | 49 | dst + dst_stride * (src_height * 2 - 1)); | |
| 566 | 49 | } | |
| 567 | |||
| 568 | 50 | return KLEIDICV_OK; | |
| 569 | 50 | } | |
| 570 | |||
| 571 | 38 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( | |
| 572 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 573 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
| 574 | using T = float; | ||
| 575 | 38 | size_t dst_height = src_height * 4; | |
| 576 | 38 | size_t dst_width = src_width * 4; | |
| 577 | 38 | src_stride /= sizeof(T); | |
| 578 | 38 | dst_stride /= sizeof(T); | |
| 579 | |||
| 580 | 1398 | auto lerp1d_scalar = [](T coeff_a, T a, T coeff_b, T b) { | |
| 581 | 1360 | return coeff_a * a + coeff_b * b; | |
| 582 | }; | ||
| 583 | 22182 | auto lerp1d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b) { | |
| 584 | 22144 | return vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b); | |
| 585 | }; | ||
| 586 | 4198 | auto lerp2d_scalar = [](T coeff_a, T a, T coeff_b, T b, T coeff_c, T c, | |
| 587 | T coeff_d, T d) { | ||
| 588 | 4160 | return coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d; | |
| 589 | }; | ||
| 590 | 17782 | auto lerp2d_vector = [](T coeff_a, float32x4_t a, T coeff_b, float32x4_t b, | |
| 591 | T coeff_c, float32x4_t c, T coeff_d, float32x4_t d) { | ||
| 592 | 17744 | return vmlaq_n_f32( | |
| 593 | 35488 | vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(a, coeff_a), b, coeff_b), c, | |
| 594 | 17744 | coeff_c), | |
| 595 | 17744 | d, coeff_d); | |
| 596 | }; | ||
| 597 | // Handle top or bottom edge | ||
| 598 | 112 | auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( | |
| 599 | const T *src_row, T *dst_row) { | ||
| 600 | // Left elements | ||
| 601 | 74 | dst_row[1] = dst_row[0] = src_row[0]; | |
| 602 | |||
| 603 | // Right elements | ||
| 604 | 74 | dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; | |
| 605 | |||
| 606 | // Middle elements | ||
| 607 | 74 | size_t src_x = 0; | |
| 608 |
2/2✓ Branch 0 taken 1100 times.
✓ Branch 1 taken 74 times.
|
1174 | for (; src_x + 4 < src_width; src_x += 4) { |
| 609 | 1100 | size_t dst_x = src_x * 4 + 2; | |
| 610 | 1100 | float32x4_t a = vld1q_f32(src_row + src_x); | |
| 611 | 1100 | float32x4_t b = vld1q_f32(src_row + src_x + 1); | |
| 612 | 1100 | vst4q_f32(dst_row + dst_x, | |
| 613 | (float32x4x4_t{lerp1d_vector(0.875F, a, 0.125F, b), | ||
| 614 | lerp1d_vector(0.625F, a, 0.375F, b), | ||
| 615 | lerp1d_vector(0.375F, a, 0.625F, b), | ||
| 616 | lerp1d_vector(0.125F, a, 0.875F, b)})); | ||
| 617 | 1100 | } | |
| 618 |
2/2✓ Branch 0 taken 76 times.
✓ Branch 1 taken 74 times.
|
150 | for (; src_x + 1 < src_width; ++src_x) { |
| 619 | 76 | size_t dst_x = src_x * 4 + 2; | |
| 620 | 76 | const T a = src_row[src_x], b = src_row[src_x + 1]; | |
| 621 | 76 | dst_row[dst_x + 0] = lerp1d_scalar(0.875F, a, 0.125F, b); | |
| 622 | 76 | dst_row[dst_x + 1] = lerp1d_scalar(0.625F, a, 0.375F, b); | |
| 623 | 76 | dst_row[dst_x + 2] = lerp1d_scalar(0.375F, a, 0.625F, b); | |
| 624 | 76 | dst_row[dst_x + 3] = lerp1d_scalar(0.125F, a, 0.875F, b); | |
| 625 | 76 | } | |
| 626 | 74 | }; | |
| 627 | |||
| 628 | 170 | auto process_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector, | |
| 629 | lerp2d_scalar, lerp2d_vector]( | ||
| 630 | const T *src_row0, const T *src_row1, T *dst_row0, | ||
| 631 | T *dst_row1, T *dst_row2, T *dst_row3) { | ||
| 632 | // Left elements | ||
| 633 | 132 | const T s0l = src_row0[0], s1l = src_row1[0]; | |
| 634 | 132 | dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l); | |
| 635 | 132 | dst_row1[0] = dst_row1[1] = lerp1d_scalar(0.625F, s0l, 0.375F, s1l); | |
| 636 | 132 | dst_row2[0] = dst_row2[1] = lerp1d_scalar(0.375F, s0l, 0.625F, s1l); | |
| 637 | 132 | dst_row3[0] = dst_row3[1] = lerp1d_scalar(0.125F, s0l, 0.875F, s1l); | |
| 638 | |||
| 639 | // Right elements | ||
| 640 | 132 | const T s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; | |
| 641 | 132 | const size_t dr0 = dst_width - 2; | |
| 642 | 132 | const size_t dr1 = dst_width - 1; | |
| 643 | 132 | dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(0.875F, s0r, 0.125F, s1r); | |
| 644 | 132 | dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(0.625F, s0r, 0.375F, s1r); | |
| 645 | 132 | dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(0.375F, s0r, 0.625F, s1r); | |
| 646 | 132 | dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r); | |
| 647 | |||
| 648 | // Middle elements | ||
| 649 | 132 | size_t src_x = 0; | |
| 650 |
2/2✓ Branch 0 taken 2218 times.
✓ Branch 1 taken 132 times.
|
2350 | for (; src_x + 4 < src_width; src_x += 4) { |
| 651 | 2218 | size_t dst_x = src_x * 4 + 2; | |
| 652 | |||
| 653 | 2218 | float32x4_t a = vld1q_f32(src_row0 + src_x); | |
| 654 | 2218 | float32x4_t b = vld1q_f32(src_row0 + src_x + 1); | |
| 655 | 2218 | float32x4_t c = vld1q_f32(src_row1 + src_x); | |
| 656 | 2218 | float32x4_t d = vld1q_f32(src_row1 + src_x + 1); | |
| 657 | |||
| 658 | 2218 | float32x4x4_t dst_a{ | |
| 659 | 8872 | lerp2d_vector(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d), | |
| 660 | 2218 | lerp2d_vector(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d), | |
| 661 | 2218 | lerp2d_vector(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d), | |
| 662 | 2218 | lerp2d_vector(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d), | |
| 663 | }; | ||
| 664 | 2218 | float32x4x4_t dst_d{ | |
| 665 | 8872 | lerp2d_vector(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d), | |
| 666 | 2218 | lerp2d_vector(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d), | |
| 667 | 2218 | lerp2d_vector(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d), | |
| 668 | 2218 | lerp2d_vector(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d), | |
| 669 | }; | ||
| 670 | 2218 | const float one_3rd = 0.3333333333333333F; | |
| 671 | 2218 | const float two_3rd = 0.6666666666666667F; | |
| 672 | 2218 | vst4q_f32(dst_row0 + dst_x, dst_a); | |
| 673 | 2218 | vst4q_f32(dst_row1 + dst_x, | |
| 674 | (float32x4x4_t{ | ||
| 675 | lerp1d_vector(two_3rd, dst_a.val[0], one_3rd, dst_d.val[0]), | ||
| 676 | lerp1d_vector(two_3rd, dst_a.val[1], one_3rd, dst_d.val[1]), | ||
| 677 | lerp1d_vector(two_3rd, dst_a.val[2], one_3rd, dst_d.val[2]), | ||
| 678 | lerp1d_vector(two_3rd, dst_a.val[3], one_3rd, dst_d.val[3]), | ||
| 679 | })); | ||
| 680 | 2218 | vst4q_f32(dst_row2 + dst_x, | |
| 681 | (float32x4x4_t{ | ||
| 682 | lerp1d_vector(one_3rd, dst_a.val[0], two_3rd, dst_d.val[0]), | ||
| 683 | lerp1d_vector(one_3rd, dst_a.val[1], two_3rd, dst_d.val[1]), | ||
| 684 | lerp1d_vector(one_3rd, dst_a.val[2], two_3rd, dst_d.val[2]), | ||
| 685 | lerp1d_vector(one_3rd, dst_a.val[3], two_3rd, dst_d.val[3]), | ||
| 686 | })); | ||
| 687 | 2218 | vst4q_f32(dst_row3 + dst_x, dst_d); | |
| 688 | 2218 | } | |
| 689 | |||
| 690 |
2/2✓ Branch 0 taken 260 times.
✓ Branch 1 taken 132 times.
|
392 | for (; src_x + 1 < src_width; ++src_x) { |
| 691 | 260 | size_t dst_x = src_x * 4 + 2; | |
| 692 | 260 | const T a = src_row0[src_x], b = src_row0[src_x + 1], c = src_row1[src_x], | |
| 693 | 260 | d = src_row1[src_x + 1]; | |
| 694 | |||
| 695 | 260 | dst_row0[dst_x + 0] = | |
| 696 | 260 | lerp2d_scalar(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d); | |
| 697 | 260 | dst_row0[dst_x + 1] = | |
| 698 | 260 | lerp2d_scalar(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d); | |
| 699 | 260 | dst_row0[dst_x + 2] = | |
| 700 | 260 | lerp2d_scalar(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d); | |
| 701 | 260 | dst_row0[dst_x + 3] = | |
| 702 | 260 | lerp2d_scalar(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d); | |
| 703 | 260 | dst_row1[dst_x + 0] = | |
| 704 | 260 | lerp2d_scalar(0.546875F, a, 0.078125F, b, 0.328125F, c, 0.046875F, d); | |
| 705 | 260 | dst_row1[dst_x + 1] = | |
| 706 | 260 | lerp2d_scalar(0.390625F, a, 0.234375F, b, 0.234375F, c, 0.140625F, d); | |
| 707 | 260 | dst_row1[dst_x + 2] = | |
| 708 | 260 | lerp2d_scalar(0.234375F, a, 0.390625F, b, 0.140625F, c, 0.234375F, d); | |
| 709 | 260 | dst_row1[dst_x + 3] = | |
| 710 | 260 | lerp2d_scalar(0.078125F, a, 0.546875F, b, 0.046875F, c, 0.328125F, d); | |
| 711 | 260 | dst_row2[dst_x + 0] = | |
| 712 | 260 | lerp2d_scalar(0.328125F, a, 0.046875F, b, 0.546875F, c, 0.078125F, d); | |
| 713 | 260 | dst_row2[dst_x + 1] = | |
| 714 | 260 | lerp2d_scalar(0.234375F, a, 0.140625F, b, 0.390625F, c, 0.234375F, d); | |
| 715 | 260 | dst_row2[dst_x + 2] = | |
| 716 | 260 | lerp2d_scalar(0.140625F, a, 0.234375F, b, 0.234375F, c, 0.390625F, d); | |
| 717 | 260 | dst_row2[dst_x + 3] = | |
| 718 | 260 | lerp2d_scalar(0.046875F, a, 0.328125F, b, 0.078125F, c, 0.546875F, d); | |
| 719 | 260 | dst_row3[dst_x + 0] = | |
| 720 | 260 | lerp2d_scalar(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d); | |
| 721 | 260 | dst_row3[dst_x + 1] = | |
| 722 | 260 | lerp2d_scalar(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d); | |
| 723 | 260 | dst_row3[dst_x + 2] = | |
| 724 | 260 | lerp2d_scalar(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d); | |
| 725 | 260 | dst_row3[dst_x + 3] = | |
| 726 | 260 | lerp2d_scalar(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d); | |
| 727 | 260 | } | |
| 728 | 132 | }; | |
| 729 | |||
| 730 | // Top rows | ||
| 731 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 37 times.
|
38 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 732 | 37 | process_edge_row(src, dst); | |
| 733 | 37 | memcpy(dst + dst_stride, dst, dst_stride * sizeof(T)); | |
| 734 | 37 | } | |
| 735 | |||
| 736 | // Middle rows | ||
| 737 |
2/2✓ Branch 0 taken 132 times.
✓ Branch 1 taken 38 times.
|
170 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 738 | 132 | size_t dst_y = src_y * 4 + 2; | |
| 739 | 132 | const T *src_row0 = src + src_stride * src_y; | |
| 740 | 132 | const T *src_row1 = src_row0 + src_stride; | |
| 741 | 132 | T *dst_row0 = dst + dst_stride * dst_y; | |
| 742 | 132 | T *dst_row1 = dst_row0 + dst_stride; | |
| 743 | 132 | T *dst_row2 = dst_row1 + dst_stride; | |
| 744 | 132 | T *dst_row3 = dst_row2 + dst_stride; | |
| 745 | |||
| 746 | 132 | process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); | |
| 747 | 132 | } | |
| 748 | |||
| 749 | // Bottom rows | ||
| 750 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 37 times.
|
38 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 751 | 74 | process_edge_row(src + src_stride * (src_height - 1), | |
| 752 | 37 | dst + dst_stride * (dst_height - 2)); | |
| 753 | 111 | memcpy(dst + dst_stride * (dst_height - 1), | |
| 754 | 74 | dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T)); | |
| 755 | 37 | } | |
| 756 | |||
| 757 | 38 | return KLEIDICV_OK; | |
| 758 | 38 | } | |
| 759 | |||
| 760 | 34 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( | |
| 761 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 762 | size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { | ||
| 763 | 34 | size_t dst_width = src_width * 8; | |
| 764 | 34 | size_t dst_height = src_height * 8; | |
| 765 | 34 | src_stride /= sizeof(float); | |
| 766 | 34 | dst_stride /= sizeof(float); | |
| 767 | |||
| 768 | 34 | float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0, | |
| 769 | 7 / 16.0, 5 / 16.0, 3 / 16.0, 1 / 16.0}; | ||
| 770 | 34 | float coeffs_b[] = {1 / 16.0, 3 / 16.0, 5 / 16.0, 7 / 16.0, | |
| 771 | 9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0}; | ||
| 772 | 34 | float32x4_t coeffs_a0 = vld1q_f32(&coeffs_a[0]); | |
| 773 | 34 | float32x4_t coeffs_a1 = vld1q_f32(&coeffs_a[4]); | |
| 774 | 34 | float32x4_t coeffs_b0 = vld1q_f32(&coeffs_b[0]); | |
| 775 | 34 | float32x4_t coeffs_b1 = vld1q_f32(&coeffs_b[4]); | |
| 776 | |||
| 777 | 2082 | auto lerp1d_vector_n = [](float p, float32x4_t a, float q, float32x4_t b) { | |
| 778 | 2048 | return vmlaq_n_f32(vmulq_n_f32(a, p), b, q); | |
| 779 | }; | ||
| 780 | |||
| 781 | 109570 | auto lerp1d_vector_n2 = [](float32x4_t a, float q, float32x4_t b) { | |
| 782 | 109536 | return vmlaq_n_f32(a, b, q); | |
| 783 | }; | ||
| 784 | |||
| 785 | 8970 | auto lerp1d_vector = [](float32x4_t p, float32x4_t a, float32x4_t q, | |
| 786 | float32x4_t b) { | ||
| 787 | 8936 | return vmlaq_f32(vmulq_f32(a, p), b, q); | |
| 788 | }; | ||
| 789 | |||
| 790 | // Handle top or bottom edge | ||
| 791 | 34 | auto process_edge_row = | |
| 792 | 100 | [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, | |
| 793 | &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) { | ||
| 794 | // Left elements | ||
| 795 | 66 | dst_row[3] = dst_row[2] = dst_row[1] = dst_row[0] = src_row[0]; | |
| 796 | 66 | dst_row[dst_stride + 3] = dst_row[dst_stride + 2] = | |
| 797 | 66 | dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0]; | |
| 798 | 66 | dst_row[2 * dst_stride + 3] = dst_row[2 * dst_stride + 2] = | |
| 799 | 66 | dst_row[2 * dst_stride + 1] = dst_row[2 * dst_stride] = src_row[0]; | |
| 800 | 66 | dst_row[3 * dst_stride + 3] = dst_row[3 * dst_stride + 2] = | |
| 801 | 66 | dst_row[3 * dst_stride + 1] = dst_row[3 * dst_stride] = src_row[0]; | |
| 802 | |||
| 803 | // Right elements | ||
| 804 | 66 | float *dst_right = dst_row + dst_width - 4; | |
| 805 | 66 | dst_right[3] = dst_right[2] = dst_right[1] = dst_right[0] = | |
| 806 | 66 | src_row[src_width - 1]; | |
| 807 | 66 | dst_right[dst_stride + 3] = dst_right[dst_stride + 2] = | |
| 808 | 66 | dst_right[dst_stride + 1] = dst_right[dst_stride] = | |
| 809 | 66 | src_row[src_width - 1]; | |
| 810 | 66 | dst_right[2 * dst_stride + 3] = dst_right[2 * dst_stride + 2] = | |
| 811 | 66 | dst_right[2 * dst_stride + 1] = dst_right[2 * dst_stride] = | |
| 812 | 66 | src_row[src_width - 1]; | |
| 813 | 66 | dst_right[3 * dst_stride + 3] = dst_right[3 * dst_stride + 2] = | |
| 814 | 66 | dst_right[3 * dst_stride + 1] = dst_right[3 * dst_stride] = | |
| 815 | 66 | src_row[src_width - 1]; | |
| 816 | |||
| 817 | // Middle elements | ||
| 818 | 66 | float32x4_t a, b = vdupq_n_f32(src_row[0]); | |
| 819 |
2/2✓ Branch 0 taken 66 times.
✓ Branch 1 taken 4468 times.
|
4534 | for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { |
| 820 | 4468 | a = b; | |
| 821 | 4468 | b = vdupq_n_f32(src_row[src_x + 1]); | |
| 822 | 4468 | float *dst_row0 = dst_row + src_x * 8 + 4; | |
| 823 | 4468 | float *dst_row1 = dst_row0 + dst_stride; | |
| 824 | 4468 | float *dst_row2 = dst_row1 + dst_stride; | |
| 825 | 4468 | float *dst_row3 = dst_row2 + dst_stride; | |
| 826 | 4468 | float32x4_t dst = lerp1d_vector(coeffs_a0, a, coeffs_b0, b); | |
| 827 | 4468 | vst1q(dst_row0, dst); | |
| 828 | 4468 | vst1q(dst_row1, dst); | |
| 829 | 4468 | vst1q(dst_row2, dst); | |
| 830 | 4468 | vst1q(dst_row3, dst); | |
| 831 | 4468 | dst = lerp1d_vector(coeffs_a1, a, coeffs_b1, b); | |
| 832 | 4468 | vst1q(dst_row0 + 4, dst); | |
| 833 | 4468 | vst1q(dst_row1 + 4, dst); | |
| 834 | 4468 | vst1q(dst_row2 + 4, dst); | |
| 835 | 4468 | vst1q(dst_row3 + 4, dst); | |
| 836 | 4468 | } | |
| 837 | 66 | }; | |
| 838 | |||
| 839 | 34 | float32x4_t coeffs_p0 = vmulq_n_f32(coeffs_a0, 15.0 / 16); | |
| 840 | 34 | float32x4_t coeffs_q0 = vmulq_n_f32(coeffs_b0, 15.0 / 16); | |
| 841 | 34 | float32x4_t coeffs_r0 = vmulq_n_f32(coeffs_a0, 1.0 / 16); | |
| 842 | 34 | float32x4_t coeffs_s0 = vmulq_n_f32(coeffs_b0, 1.0 / 16); | |
| 843 | 34 | float32x4_t coeffs_p1 = vmulq_n_f32(coeffs_a1, 15.0 / 16); | |
| 844 | 34 | float32x4_t coeffs_q1 = vmulq_n_f32(coeffs_b1, 15.0 / 16); | |
| 845 | 34 | float32x4_t coeffs_r1 = vmulq_n_f32(coeffs_a1, 1.0 / 16); | |
| 846 | 34 | float32x4_t coeffs_s1 = vmulq_n_f32(coeffs_b1, 1.0 / 16); | |
| 847 | |||
| 848 | 36546 | auto lerp2d_vector = [](float32x4_t a, float32x4_t p, float32x4_t b, | |
| 849 | float32x4_t q, float32x4_t c, float32x4_t r, | ||
| 850 | float32x4_t d, float32x4_t s) { | ||
| 851 | 36512 | return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s); | |
| 852 | }; | ||
| 853 | |||
| 854 | 162 | auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n, | |
| 855 | lerp1d_vector_n2, &coeffs_p0, &coeffs_q0, &coeffs_r0, | ||
| 856 | &coeffs_s0, &coeffs_p1, &coeffs_q1, &coeffs_r1, | ||
| 857 | &coeffs_s1](const float *src_row0, const float *src_row1, | ||
| 858 | float *dst_row0, size_t dst_stride) { | ||
| 859 | // Left elements | ||
| 860 | 128 | float32x4_t s0 = vdupq_n_f32(src_row0[0]); | |
| 861 | 128 | float32x4_t s1 = vdupq_n_f32(src_row1[0]); | |
| 862 | 128 | float *dst_row = dst_row0; | |
| 863 |
2/2✓ Branch 0 taken 1024 times.
✓ Branch 1 taken 128 times.
|
1152 | for (size_t i = 0; i < 8; ++i) { |
| 864 | 2048 | vst1q(dst_row, | |
| 865 | 2048 | lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0, | |
| 866 | 1024 | static_cast<float>(i * 2 + 1) / 16.0F, s1)); | |
| 867 | 1024 | dst_row += dst_stride; | |
| 868 | 1024 | } | |
| 869 | |||
| 870 | // Middle elements | ||
| 871 | 128 | dst_row0 += 4; | |
| 872 | 128 | float *dst_row1 = dst_row0 + dst_stride; | |
| 873 | 128 | float *dst_row2 = dst_row1 + dst_stride; | |
| 874 | 128 | float *dst_row3 = dst_row2 + dst_stride; | |
| 875 | 128 | float *dst_row4 = dst_row3 + dst_stride; | |
| 876 | 128 | float *dst_row5 = dst_row4 + dst_stride; | |
| 877 | 128 | float *dst_row6 = dst_row5 + dst_stride; | |
| 878 | 128 | float *dst_row7 = dst_row6 + dst_stride; | |
| 879 | 128 | float32x4_t a, b = s0; | |
| 880 | 128 | float32x4_t c, d = s1; | |
| 881 |
2/2✓ Branch 0 taken 9128 times.
✓ Branch 1 taken 128 times.
|
9256 | for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { |
| 882 | 9128 | a = b; | |
| 883 | 9128 | b = vdupq_n_f32(src_row0[src_x + 1]); | |
| 884 | 9128 | c = d; | |
| 885 | 9128 | d = vdupq_n_f32(src_row1[src_x + 1]); | |
| 886 | 9128 | float32x4x2_t dst_0; | |
| 887 | 9128 | dst_0.val[0] = | |
| 888 | 9128 | lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); | |
| 889 | 9128 | dst_0.val[1] = | |
| 890 | 9128 | lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d); | |
| 891 | |||
| 892 | 9128 | neon::VecTraits<float>::store(dst_0, dst_row0); | |
| 893 | 9128 | float32x4x2_t dst_7; | |
| 894 | 9128 | dst_7.val[0] = | |
| 895 | 9128 | lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); | |
| 896 | 9128 | dst_7.val[1] = | |
| 897 | 9128 | lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d); | |
| 898 | |||
| 899 | 9128 | neon::VecTraits<float>::store(dst_7, dst_row7); | |
| 900 | 9128 | float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]); | |
| 901 | 9128 | float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]); | |
| 902 | |||
| 903 | 9128 | float32x4x2_t dst; | |
| 904 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0); | |
| 905 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1); | |
| 906 | |||
| 907 | 9128 | neon::VecTraits<float>::store(dst, dst_row1); | |
| 908 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0); | |
| 909 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1); | |
| 910 | |||
| 911 | 9128 | neon::VecTraits<float>::store(dst, dst_row2); | |
| 912 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0); | |
| 913 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1); | |
| 914 | |||
| 915 | 9128 | neon::VecTraits<float>::store(dst, dst_row3); | |
| 916 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0); | |
| 917 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1); | |
| 918 | |||
| 919 | 9128 | neon::VecTraits<float>::store(dst, dst_row4); | |
| 920 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0); | |
| 921 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1); | |
| 922 | |||
| 923 | 9128 | neon::VecTraits<float>::store(dst, dst_row5); | |
| 924 | 9128 | dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0); | |
| 925 | 9128 | dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1); | |
| 926 | |||
| 927 | 9128 | neon::VecTraits<float>::store(dst, dst_row6); | |
| 928 | 9128 | dst_row0 += 8; | |
| 929 | 9128 | dst_row1 += 8; | |
| 930 | 9128 | dst_row2 += 8; | |
| 931 | 9128 | dst_row3 += 8; | |
| 932 | 9128 | dst_row4 += 8; | |
| 933 | 9128 | dst_row5 += 8; | |
| 934 | 9128 | dst_row6 += 8; | |
| 935 | 9128 | dst_row7 += 8; | |
| 936 | 9128 | } | |
| 937 | |||
| 938 | // Right elements | ||
| 939 | 128 | s0 = b; | |
| 940 | 128 | s1 = d; | |
| 941 | 128 | dst_row = dst_row0; | |
| 942 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 1024 times.
|
1152 | for (size_t i = 0; i < 8; ++i) { |
| 943 | 2048 | vst1q(dst_row, | |
| 944 | 2048 | lerp1d_vector_n(static_cast<float>(15 - i * 2) / 16.0F, s0, | |
| 945 | 1024 | static_cast<float>(i * 2 + 1) / 16.0F, s1)); | |
| 946 | 1024 | dst_row += dst_stride; | |
| 947 | 1024 | } | |
| 948 | 128 | }; | |
| 949 | |||
| 950 | // Top rows | ||
| 951 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_begin == 0)) { |
| 952 | 33 | process_edge_row(src, dst, dst_stride); | |
| 953 | 33 | } | |
| 954 | |||
| 955 | // Middle rows | ||
| 956 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.
|
162 | for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { |
| 957 | 128 | size_t dst_y = src_y * 8 + 4; | |
| 958 | 128 | const float *src_row0 = src + src_stride * src_y; | |
| 959 | 128 | const float *src_row1 = src_row0 + src_stride; | |
| 960 | 128 | process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride); | |
| 961 | 128 | } | |
| 962 | |||
| 963 | // Bottom rows | ||
| 964 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.
|
34 | if (KLEIDICV_LIKELY(y_end == src_height)) { |
| 965 | 66 | process_edge_row(src + src_stride * (src_height - 1), | |
| 966 | 33 | dst + dst_stride * (dst_height - 4), dst_stride); | |
| 967 | 33 | } | |
| 968 | |||
| 969 | 34 | return KLEIDICV_OK; | |
| 970 | 34 | } | |
| 971 | |||
| 972 | 129 | kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride, | |
| 973 | size_t src_width, size_t src_height, | ||
| 974 | size_t y_begin, size_t y_end, | ||
| 975 | float *dst, size_t dst_stride, | ||
| 976 | size_t dst_width, size_t dst_height) { | ||
| 977 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 128 times.
|
129 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 978 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 127 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 127 times.
|
128 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
| 979 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 126 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 125 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 125 times.
|
127 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
| 980 | |||
| 981 |
4/4✓ Branch 0 taken 123 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 122 times.
|
125 | if (src_width == 0 || src_height == 0) { |
| 982 | 3 | return KLEIDICV_OK; | |
| 983 | } | ||
| 984 |
3/4✓ Branch 0 taken 50 times.
✓ Branch 1 taken 72 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 50 times.
|
122 | if (src_width * 2 == dst_width && src_height * 2 == dst_height) { |
| 985 | 100 | return resize_2x2_f32(src, src_stride, src_width, src_height, y_begin, | |
| 986 | 50 | y_end, dst, dst_stride); | |
| 987 | } | ||
| 988 |
3/4✓ Branch 0 taken 38 times.
✓ Branch 1 taken 34 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 38 times.
|
72 | if (src_width * 4 == dst_width && src_height * 4 == dst_height) { |
| 989 | 76 | return resize_4x4_f32(src, src_stride, src_width, src_height, y_begin, | |
| 990 | 38 | y_end, dst, dst_stride); | |
| 991 | } | ||
| 992 |
2/4✓ Branch 0 taken 34 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 34 times.
|
34 | if (src_width * 8 == dst_width && src_height * 8 == dst_height) { |
| 993 | 68 | return resize_8x8_f32(src, src_stride, src_width, src_height, y_begin, | |
| 994 | 34 | y_end, dst, dst_stride); | |
| 995 | } | ||
| 996 | // resize_linear_f32_is_implemented checked the kernel size already. | ||
| 997 | // GCOVR_EXCL_START | ||
| 998 | assert(!"resize ratio not implemented"); | ||
| 999 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1000 | // GCOVR_EXCL_STOP | ||
| 1001 | 129 | } | |
| 1002 | |||
| 1003 | } // namespace kleidicv::neon | ||
| 1004 |