| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_RESIZE_SC_H | ||
| 6 | #define KLEIDICV_RESIZE_SC_H | ||
| 7 | |||
| 8 | #include "kleidicv/kleidicv.h" | ||
| 9 | #include "kleidicv/sve2.h" | ||
| 10 | |||
| 11 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 12 | |||
| 13 | 512 | static inline svuint8_t resize_parallel_vectors( | |
| 14 | svbool_t pg, svuint8_t top_row, svuint8_t bottom_row) KLEIDICV_STREAMING { | ||
| 15 | 512 | svuint16_t result_before_averaging_b = svaddlb(top_row, bottom_row); | |
| 16 | 512 | svuint16_t result_before_averaging_t = svaddlt(top_row, bottom_row); | |
| 17 | 1024 | svuint16_t result_before_averaging = | |
| 18 | 512 | svadd_x(pg, result_before_averaging_b, result_before_averaging_t); | |
| 19 | 1024 | return svrshrnb(result_before_averaging, 2); | |
| 20 | 512 | } | |
| 21 | |||
| 22 | 96 | static inline void parallel_rows_vectors_path_2x( | |
| 23 | svbool_t pg, Rows<const uint8_t> src_rows, | ||
| 24 | Rows<uint8_t> dst_rows) KLEIDICV_STREAMING { | ||
| 25 | #if KLEIDICV_TARGET_SME2 | ||
| 26 | 24 | svcount_t pg_counter = svptrue_c8(); | |
| 27 | 24 | auto src_top = svld1_x2(pg_counter, &src_rows.at(0)[0]); | |
| 28 | 24 | auto src_bottom = svld1_x2(pg_counter, &src_rows.at(1)[0]); | |
| 29 | 24 | svuint8_t top_row_0 = svget2(src_top, 0); | |
| 30 | 24 | svuint8_t top_row_1 = svget2(src_top, 1); | |
| 31 | 24 | svuint8_t bottom_row_0 = svget2(src_bottom, 0); | |
| 32 | 24 | svuint8_t bottom_row_1 = svget2(src_bottom, 1); | |
| 33 | #else | ||
| 34 | 72 | svuint8_t top_row_0 = svld1(pg, &src_rows.at(0)[0]); | |
| 35 | 72 | svuint8_t bottom_row_0 = svld1(pg, &src_rows.at(1)[0]); | |
| 36 | 72 | svuint8_t top_row_1 = svld1_vnum(pg, &src_rows.at(0)[0], 1); | |
| 37 | 72 | svuint8_t bottom_row_1 = svld1_vnum(pg, &src_rows.at(1)[0], 1); | |
| 38 | #endif // KLEIDICV_TARGET_SME2 | ||
| 39 | 96 | svuint16_t sum0b = svaddlb(top_row_0, bottom_row_0); | |
| 40 | 96 | svuint16_t sum0t = svaddlt(top_row_0, bottom_row_0); | |
| 41 | 96 | svuint16_t sum1b = svaddlb(top_row_1, bottom_row_1); | |
| 42 | 96 | svuint16_t sum1t = svaddlt(top_row_1, bottom_row_1); | |
| 43 | 96 | svuint8_t res0 = svrshrnb(svadd_x(pg, sum0b, sum0t), 2); | |
| 44 | 96 | svuint8_t res1 = svrshrnb(svadd_x(pg, sum1b, sum1t), 2); | |
| 45 | 96 | svuint8_t result = svuzp1(res0, res1); | |
| 46 | 96 | svst1(pg, &dst_rows[0], result); | |
| 47 | 96 | } | |
| 48 | |||
| 49 | 512 | static inline void parallel_rows_vectors_path( | |
| 50 | svbool_t pg, Rows<const uint8_t> src_rows, | ||
| 51 | Rows<uint8_t> dst_rows) KLEIDICV_STREAMING { | ||
| 52 | 512 | svuint8_t top_line = svld1(pg, &src_rows.at(0)[0]); | |
| 53 | 512 | svuint8_t bottom_line = svld1(pg, &src_rows.at(1)[0]); | |
| 54 | 512 | svuint8_t result = resize_parallel_vectors(pg, top_line, bottom_line); | |
| 55 | 512 | svst1b(pg, &dst_rows[0], svreinterpret_u16_u8(result)); | |
| 56 | 512 | } | |
| 57 | |||
| 58 | template <typename ScalarType> | ||
| 59 | 584 | static inline void process_parallel_rows(Rows<const ScalarType> src_rows, | |
| 60 | size_t src_width, | ||
| 61 | Rows<ScalarType> dst_rows, | ||
| 62 | size_t dst_width) KLEIDICV_STREAMING { | ||
| 63 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
| 64 | 584 | const size_t size_mask = ~static_cast<size_t>(1U); | |
| 65 | |||
| 66 | // Process rows up to the last even pixel index. | ||
| 67 | 1168 | LoopUnroll2{src_width & size_mask, VecTraits::num_lanes()} | |
| 68 | // Process double vector chunks. | ||
| 69 | 680 | .unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
| 70 | 96 | auto pg = VecTraits::svptrue(); | |
| 71 | 192 | parallel_rows_vectors_path_2x(pg, src_rows.at(0, index), | |
| 72 | 96 | dst_rows.at(0, index / 2)); | |
| 73 | 96 | }) | |
| 74 | 632 | .unroll_once([&](size_t index) KLEIDICV_STREAMING { | |
| 75 | 48 | auto pg = VecTraits::svptrue(); | |
| 76 | 96 | parallel_rows_vectors_path(pg, src_rows.at(0, index), | |
| 77 | 48 | dst_rows.at(0, index / 2)); | |
| 78 | 48 | }) | |
| 79 | // Process the remaining chunk of the row. | ||
| 80 | 1048 | .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
| 81 | 464 | auto pg = VecTraits::svwhilelt(index, length); | |
| 82 | 928 | parallel_rows_vectors_path(pg, src_rows.at(0, index), | |
| 83 | 464 | dst_rows.at(0, index / 2)); | |
| 84 | 464 | }); | |
| 85 | |||
| 86 | // Handle the last odd column, if any. | ||
| 87 |
2/2✓ Branch 0 taken 504 times.
✓ Branch 1 taken 80 times.
|
584 | if (dst_width > (src_width / 2)) { |
| 88 | 80 | dst_rows[dst_width - 1] = rounding_shift_right<uint16_t>( | |
| 89 | 160 | static_cast<const uint16_t>(src_rows.at(0, src_width - 1)[0]) + | |
| 90 | 80 | src_rows.at(1, src_width - 1)[0], | |
| 91 | 1); | ||
| 92 | 80 | } | |
| 93 | 584 | } | |
| 94 | |||
| 95 | static inline svuint8_t resize_single_row(svbool_t pg, | ||
| 96 | svuint8_t row) KLEIDICV_STREAMING { | ||
| 97 | return svrshrnb(svadalp_x(pg, svdup_u16(0), row), 1); | ||
| 98 | } | ||
| 99 | |||
| 100 | 20 | static inline void single_row_vector_path_2x( | |
| 101 | svbool_t pg, Rows<const uint8_t> src_rows, | ||
| 102 | Rows<uint8_t> dst_rows) KLEIDICV_STREAMING { | ||
| 103 | #if KLEIDICV_TARGET_SME2 | ||
| 104 | 5 | svcount_t pg_counter = svptrue_c8(); | |
| 105 | 5 | auto src = svld1_x2(pg_counter, &src_rows.at(0)[0]); | |
| 106 | 5 | svuint8_t line0 = svget2(src, 0); | |
| 107 | 5 | svuint8_t line1 = svget2(src, 1); | |
| 108 | #else | ||
| 109 | 15 | svuint8_t line0 = svld1(pg, &src_rows[0]); | |
| 110 | 15 | svuint8_t line1 = svld1_vnum(pg, &src_rows[0], 1); | |
| 111 | #endif // KLEIDICV_TARGET_SME2 | ||
| 112 | 20 | svuint8_t result0 = svrshrnb(svadalp_x(pg, svdup_u16(0), line0), 1); | |
| 113 | 20 | svuint8_t result1 = svrshrnb(svadalp_x(pg, svdup_u16(0), line1), 1); | |
| 114 | 20 | svst1b(pg, &dst_rows[0], svreinterpret_u16_u8(result0)); | |
| 115 | 20 | svst1b_vnum(pg, &dst_rows[0], 1, svreinterpret_u16_u8(result1)); | |
| 116 | 20 | } | |
| 117 | |||
| 118 | 48 | static inline void single_row_vector_path( | |
| 119 | svbool_t pg, Rows<const uint8_t> src_rows, | ||
| 120 | Rows<uint8_t> dst_rows) KLEIDICV_STREAMING { | ||
| 121 | 48 | svuint8_t line = svld1(pg, &src_rows.at(0)[0]); | |
| 122 | 48 | svuint8_t result = svrshrnb(svadalp_x(pg, svdup_u16(0), line), 1); | |
| 123 | 48 | svst1b(pg, &dst_rows[0], svreinterpret_u16_u8(result)); | |
| 124 | 48 | } | |
| 125 | |||
| 126 | template <typename ScalarType> | ||
| 127 | 68 | static inline void process_single_row(Rows<const ScalarType> src_rows, | |
| 128 | size_t src_width, | ||
| 129 | Rows<ScalarType> dst_rows, | ||
| 130 | size_t dst_width) KLEIDICV_STREAMING { | ||
| 131 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
| 132 | 68 | const size_t size_mask = ~static_cast<size_t>(1U); | |
| 133 | |||
| 134 | // Process rows up to the last even pixel index. | ||
| 135 | 136 | LoopUnroll2{src_width & size_mask, VecTraits::num_lanes()} | |
| 136 | // Process full vector chunks. | ||
| 137 | 88 | .unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
| 138 | 20 | auto pg = VecTraits::svptrue(); | |
| 139 | 40 | single_row_vector_path_2x(pg, src_rows.at(0, index), | |
| 140 | 20 | dst_rows.at(0, index / 2)); | |
| 141 | 20 | }) | |
| 142 | 80 | .unroll_once([&](size_t index) KLEIDICV_STREAMING { | |
| 143 | 12 | auto pg = VecTraits::svptrue(); | |
| 144 | 24 | single_row_vector_path(pg, src_rows.at(0, index), | |
| 145 | 12 | dst_rows.at(0, index / 2)); | |
| 146 | 12 | }) | |
| 147 | // Process the remaining chunk of the row. | ||
| 148 | 104 | .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
| 149 | 36 | auto pg = VecTraits::svwhilelt(index, length); | |
| 150 | 72 | single_row_vector_path(pg, src_rows.at(0, index), | |
| 151 | 36 | dst_rows.at(0, index / 2)); | |
| 152 | 36 | }); | |
| 153 | |||
| 154 | // Handle the last odd column, if any. | ||
| 155 |
2/2✓ Branch 0 taken 36 times.
✓ Branch 1 taken 32 times.
|
68 | if (dst_width > (src_width / 2)) { |
| 156 | 32 | dst_rows[dst_width - 1] = src_rows[src_width - 1]; | |
| 157 | 32 | } | |
| 158 | 68 | } | |
| 159 | |||
| 160 | KLEIDICV_TARGET_FN_ATTRS | ||
| 161 | 480 | static kleidicv_error_t check_dimensions(size_t src_dim, | |
| 162 | size_t dst_dim) KLEIDICV_STREAMING { | ||
| 163 | 480 | size_t half_src_dim = src_dim / 2; | |
| 164 | |||
| 165 |
2/2✓ Branch 0 taken 232 times.
✓ Branch 1 taken 248 times.
|
480 | if ((src_dim % 2) == 0) { |
| 166 |
2/2✓ Branch 0 taken 240 times.
✓ Branch 1 taken 8 times.
|
248 | if (dst_dim == half_src_dim) { |
| 167 | 240 | return KLEIDICV_OK; | |
| 168 | } | ||
| 169 | 8 | } else { | |
| 170 |
4/4✓ Branch 0 taken 136 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 128 times.
✓ Branch 3 taken 8 times.
|
232 | if (dst_dim == half_src_dim || dst_dim == (half_src_dim + 1)) { |
| 171 | 224 | return KLEIDICV_OK; | |
| 172 | } | ||
| 173 | } | ||
| 174 | |||
| 175 | 16 | return KLEIDICV_ERROR_RANGE; | |
| 176 | 480 | } | |
| 177 | |||
| 178 | 260 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_to_quarter_u8_sc( | |
| 179 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 180 | uint8_t *dst, size_t dst_stride, size_t dst_width, | ||
| 181 | size_t dst_height) KLEIDICV_STREAMING { | ||
| 182 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 256 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 256 times.
|
260 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 183 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 252 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 252 times.
|
256 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
| 184 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 248 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 244 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 244 times.
|
252 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 185 | |||
| 186 |
4/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 236 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 236 times.
|
252 | if (kleidicv_error_t ret = check_dimensions(src_width, dst_width)) { |
| 187 | 8 | return ret; | |
| 188 | } | ||
| 189 | |||
| 190 |
4/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 228 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 228 times.
|
244 | if (kleidicv_error_t ret = check_dimensions(src_height, dst_height)) { |
| 191 | 8 | return ret; | |
| 192 | } | ||
| 193 | |||
| 194 | 228 | Rows<const uint8_t> src_rows{src, src_stride, /* channels*/ 1}; | |
| 195 | 228 | Rows<uint8_t> dst_rows{dst, dst_stride, /* channels*/ 1}; | |
| 196 | 228 | LoopUnroll2 loop{src_height, /* Process two rows */ 2}; | |
| 197 | |||
| 198 | // Process two rows at once. | ||
| 199 | 812 | loop.unroll_once([&](size_t) // NOLINT(readability/casting) | |
| 200 | KLEIDICV_STREAMING { | ||
| 201 | 1168 | process_parallel_rows(src_rows, src_width, dst_rows, | |
| 202 | 584 | dst_width); | |
| 203 | 584 | src_rows += 2; | |
| 204 | 584 | ++dst_rows; | |
| 205 | 584 | }); | |
| 206 | |||
| 207 | // Handle an odd row, if any. | ||
| 208 |
2/2✓ Branch 0 taken 160 times.
✓ Branch 1 taken 68 times.
|
228 | if (dst_height > (src_height / 2)) { |
| 209 | 136 | loop.remaining([&](size_t, size_t) KLEIDICV_STREAMING { | |
| 210 | 68 | process_single_row(src_rows, src_width, dst_rows, dst_width); | |
| 211 | 68 | }); | |
| 212 | 68 | } | |
| 213 | 228 | return KLEIDICV_OK; | |
| 214 | 260 | } | |
| 215 | |||
| 216 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 217 | |||
| 218 | #endif // KLEIDICV_RESIZE_SC_H | ||
| 219 |