| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H | ||
| 6 | #define KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | #include <cstddef> | ||
| 10 | #include <memory> | ||
| 11 | |||
| 12 | #include "kleidicv/kleidicv.h" | ||
| 13 | #include "kleidicv/sve2.h" | ||
| 14 | |||
| 15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 16 | |||
| 17 | //------------------------------------------------------ | ||
| 18 | /// Generic resize for ratios 1/3 to 1/1, u8, 1channel | ||
| 19 | //------------------------------------------------------ | ||
| 20 | |||
| 21 | namespace resize_generic_u8 { | ||
| 22 | |||
| 23 | // For the coordinate calculation, fixed-point format is used, for better | ||
| 24 | // performance. Fixed-point format: | ||
| 25 | // - lowest 16 bits are the fractional part, that is the kFixpBits constant | ||
| 26 | // - at interpolation, the high 8 bits are used from the fractional part | ||
| 27 | // (this is a good compromise between accuracy and performance: because the | ||
| 28 | // result is 8 bits, the error only affects the least significant 1-2 bits; | ||
| 29 | // see the accuracy calculation in kleidicv.h) | ||
| 30 | // - to get the integer part, right shift by 16 bits, or zip/unzip/tbl etc. to | ||
| 31 | // get the bytes needed | ||
| 32 | // - for better accuracy, rounding is needed everywhere, i.e. adding 0.5, which | ||
| 33 | // is 1 << 15 | ||
| 34 | |||
// Number of fractional bits in the fixed-point coordinate format.
static constexpr ptrdiff_t kFixpBits{16};
// 0.5 in that fixed-point format (1 << 15); added wherever rounding is needed.
static constexpr ptrdiff_t kFixpHalf{ptrdiff_t{1} << (kFixpBits - 1)};
| 37 | |||
| 38 | // Precalc 1 item: | ||
| 39 | // Frac: 2 vectors u16 | ||
| 40 | // Idx: 1 vector u8 (left_idx) | ||
| 41 | // Src_index: uint64 (separate array) | ||
| 42 | template <size_t kRatio> | ||
| 43 | struct PrecalcIterator { | ||
| 44 | size_t index_; | ||
| 45 | uint64_t *src_index_ptr_; | ||
| 46 | const size_t kStep, kIdxFracStep; | ||
| 47 | uint8_t *idx_ptr_; | ||
| 48 | uint16_t *frac_ptr_; | ||
| 49 | 5267 | PrecalcIterator(size_t kStepDst, uint64_t *src_indices, | |
| 50 | uint8_t *p_idx_frac) KLEIDICV_STREAMING | ||
| 51 | 5267 | : index_{0}, | |
| 52 | 5267 | src_index_ptr_{src_indices}, | |
| 53 | 5267 | kStep{kStepDst}, | |
| 54 | 5267 | kIdxFracStep{kStep * (2 + 1)}, | |
| 55 | 5267 | idx_ptr_{p_idx_frac}, | |
| 56 | 5267 | frac_ptr_{reinterpret_cast<uint16_t *>(p_idx_frac + kStep)} {} | |
| 57 | |||
| 58 | 769817 | PrecalcIterator &operator++() KLEIDICV_STREAMING { | |
| 59 | 769817 | ++index_; | |
| 60 | 769817 | ++src_index_ptr_; | |
| 61 | 769817 | idx_ptr_ += kIdxFracStep; | |
| 62 | 769817 | frac_ptr_ += kIdxFracStep / 2; | |
| 63 | 769817 | return *this; | |
| 64 | } | ||
| 65 | }; | ||
| 66 | |||
| 67 | template <ptrdiff_t kRatio, ptrdiff_t kChannels> | ||
| 68 | class PrecalcIndicesFractions final { | ||
| 69 | public: | ||
| 70 | 457 | PrecalcIndicesFractions(size_t src_width, size_t dst_width, | |
| 71 | ptrdiff_t kStep) KLEIDICV_STREAMING | ||
| 72 | 457 | : src_width_{src_width}, | |
| 73 | 457 | dst_width_{dst_width}, | |
| 74 | 457 | n_iterations_{0}, | |
| 75 | 457 | n_iterations_2x_{0}, | |
| 76 | 457 | kStep_{kStep}, | |
| 77 | 457 | precalc_src_bases_{nullptr, &std::free}, | |
| 78 | 457 | precalc_idx_frac_{nullptr, &std::free} {} | |
| 79 | |||
| 80 | 5267 | PrecalcIterator<kRatio> begin() const KLEIDICV_STREAMING { | |
| 81 | 10534 | return PrecalcIterator<kRatio>(kStep_, precalc_src_bases_.get(), | |
| 82 | 5267 | precalc_idx_frac_.get()); | |
| 83 | } | ||
| 84 | |||
| 85 | 318 | bool precalculate_indices_fractions_srcindices() KLEIDICV_STREAMING { | |
| 86 |
8/8✓ Branch 0 taken 72 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 81 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 72 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 81 times.
✓ Branch 7 taken 3 times.
|
318 | if (!allocate_temp_buffers()) { |
| 87 | 12 | return false; | |
| 88 | } | ||
| 89 | |||
| 90 | // These starting values are not aligned to center. The center alignment | ||
| 91 | // must be added only once. When added to a center-aligned source_x | ||
| 92 | // value, the result will be center-aligned. | ||
| 93 | 306 | svuint32_t vsx0b = make_vsx0(0); | |
| 94 | 306 | svuint32_t vsx0t = make_vsx0(1); | |
| 95 | 306 | svuint32_t vsx1b = make_vsx0(2 * svcntw()); | |
| 96 | 306 | svuint32_t vsx1t = make_vsx0(2 * svcntw() + 1); | |
| 97 | // from each even 16bit element, take the low byte, and the high is 0 | ||
| 98 | 612 | svuint8_t vsxfrac_bottom_tbl = | |
| 99 | 306 | svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004)); | |
| 100 | // from each odd 16bit element, take the low byte, and the high is 0 | ||
| 101 | 612 | svuint8_t vsxfrac_top_tbl = | |
| 102 | 306 | svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004)); | |
| 103 | |||
| 104 | 612 | svuint8_t vchannels = svreinterpret_u8_u32( | |
| 105 | 306 | svdup_n_u32(kChannels == 4 ? 0x03020100U : 0x01000100)); | |
| 106 | |||
| 107 | // Difference in source x coordinate, for one vector path | ||
| 108 | 612 | const uint64_t sx_fixp_step = rounding_div( | |
| 109 | 306 | ((src_width_ * kStep_ / kChannels) << kFixpBits), dst_width_); | |
| 110 | 306 | uint64_t sx_fixp = to_src_x(0); | |
| 111 | 612 | const uint64_t max_src_index = | |
| 112 | 306 | std::max(src_width_ * kChannels - kStep_ * kRatio, 0UL); | |
| 113 | // For 1,2,4 channels dx can be iterated vector by vector, but not for 3 | ||
| 114 | 306 | ptrdiff_t dx = 0; | |
| 115 |
8/8✓ Branch 0 taken 29827 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 59729 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 30434 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 61005 times.
✓ Branch 7 taken 81 times.
|
181301 | for (auto pcit = begin(); pcit.index_ < n_iterations_; |
| 116 | 180995 | ++pcit, dx += kStep_ / kChannels) { | |
| 117 | // Repeatedly adding sx_fixp_vector_step is faster than multiplication, | ||
| 118 | // but it accumulates fixed-point error; periodic recalibration resets | ||
| 119 | // it. The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1 | ||
| 120 | // << 16). Only the upper 8 bits of the 16-bit fractional part are used | ||
| 121 | // for interpolation, so once the accumulated error reaches 1 / (1 << | ||
| 122 | // 8), it can affect later stages. This corresponds to 512 additions, | ||
| 123 | // which is calculated by this mask. | ||
| 124 | 180995 | constexpr uint64_t kRecalibrateCycleMask = ((1 << 9) - 1); | |
| 125 |
8/8✓ Branch 0 taken 29699 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 59534 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 30305 times.
✓ Branch 5 taken 129 times.
✓ Branch 6 taken 60808 times.
✓ Branch 7 taken 197 times.
|
180995 | if ((pcit.index_ & kRecalibrateCycleMask) == 0) { |
| 126 | 649 | sx_fixp = to_src_x(dx); | |
| 127 | 649 | } | |
| 128 | |||
| 129 |
8/8✓ Branch 0 taken 29796 times.
✓ Branch 1 taken 31 times.
✓ Branch 2 taken 59668 times.
✓ Branch 3 taken 61 times.
✓ Branch 4 taken 30405 times.
✓ Branch 5 taken 29 times.
✓ Branch 6 taken 60948 times.
✓ Branch 7 taken 57 times.
|
180995 | n_iterations_2x_ = (sx_fixp >> kFixpBits) * kChannels <= max_src_index |
| 130 | 180817 | ? pcit.index_ | |
| 131 | 178 | : n_iterations_2x_; | |
| 132 | 180995 | calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b, vsx0t, vsx1b, | |
| 133 | vsx1t, vsxfrac_bottom_tbl, | ||
| 134 | vsxfrac_top_tbl, vchannels); | ||
| 135 | 180995 | sx_fixp += sx_fixp_step; | |
| 136 | 180995 | } | |
| 137 | 306 | return true; | |
| 138 | 318 | } | |
| 139 | |||
| 140 | 139 | bool precalculate_indices_fractions_srcindices_3ch() KLEIDICV_STREAMING { | |
| 141 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 69 times.
|
139 | if (!allocate_temp_buffers()) { |
| 142 | 6 | return false; | |
| 143 | } | ||
| 144 | |||
| 145 | // These starting values are not aligned to center. The center alignment | ||
| 146 | // must be added only once. When added to a center-aligned source_x | ||
| 147 | // value, the result will be center-aligned. | ||
| 148 | 133 | svuint32_t vsx0b_R = make_vsx0(0); | |
| 149 | 133 | svuint32_t vsx0t_R = make_vsx0(1); | |
| 150 | 133 | svuint32_t vsx1b_R = make_vsx0(2 * svcntw()); | |
| 151 | 133 | svuint32_t vsx1t_R = make_vsx0(2 * svcntw() + 1); | |
| 152 | |||
| 153 | 133 | svuint32_t vsx0b_G = make_vsx0(4 * svcntw()); | |
| 154 | 133 | svuint32_t vsx0t_G = make_vsx0(4 * svcntw() + 1); | |
| 155 | 133 | svuint32_t vsx1b_G = make_vsx0(6 * svcntw()); | |
| 156 | 133 | svuint32_t vsx1t_G = make_vsx0(6 * svcntw() + 1); | |
| 157 | |||
| 158 | 133 | svuint32_t vsx0b_B = make_vsx0(8 * svcntw()); | |
| 159 | 133 | svuint32_t vsx0t_B = make_vsx0(8 * svcntw() + 1); | |
| 160 | 133 | svuint32_t vsx1b_B = make_vsx0(10 * svcntw()); | |
| 161 | 133 | svuint32_t vsx1t_B = make_vsx0(10 * svcntw() + 1); | |
| 162 | |||
| 163 | 133 | size_t kVL = svcntb(); | |
| 164 | 133 | svuint8_t vchannels_R = svindex_u8(0, 1); | |
| 165 | 133 | svuint8_t vchannels_G = svindex_u8(kVL % 3, 1); | |
| 166 | 133 | svuint8_t vchannels_B = svindex_u8((kVL + kVL) % 3, 1); | |
| 167 | // Decrease by 3 while they are >= 3 --> so we get the modulo | ||
| 168 | 133 | size_t steps = (kVL - 1) / 3; | |
| 169 |
4/4✓ Branch 0 taken 992 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 1081 times.
✓ Branch 3 taken 69 times.
|
2206 | for (size_t i = 0; i < steps; ++i) { |
| 170 | 4146 | vchannels_R = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_R, 3), | |
| 171 | 2073 | vchannels_R, 3); | |
| 172 | 4146 | vchannels_G = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_G, 3), | |
| 173 | 2073 | vchannels_G, 3); | |
| 174 | 4146 | vchannels_B = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_B, 3), | |
| 175 | 2073 | vchannels_B, 3); | |
| 176 | 2073 | } | |
| 177 | |||
| 178 | // from each even 16bit element, take the low byte, and the high is 0 | ||
| 179 | 266 | svuint8_t vsxfrac_bottom_tbl = | |
| 180 | 133 | svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004)); | |
| 181 | // from each odd 16bit element, take the low byte, and the high is 0 | ||
| 182 | 266 | svuint8_t vsxfrac_top_tbl = | |
| 183 | 133 | svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004)); | |
| 184 | |||
| 185 | // Difference in source x coordinate, for three vector paths (one iteration | ||
| 186 | // in this calculation) | ||
| 187 | 266 | const uint64_t sx_fixp_step3 = | |
| 188 | 133 | rounding_div((src_width_ * kStep_) << kFixpBits, dst_width_); | |
| 189 | 133 | uint64_t sx_fixp = to_src_x(0); | |
| 190 | 266 | const uint64_t max_src_index = | |
| 191 | 133 | std::max(src_width_ * kChannels - kStep_ * kRatio, 0UL); | |
| 192 | 133 | ptrdiff_t dx = 0; | |
| 193 | 133 | auto pcit = begin(); | |
| 194 |
4/4✓ Branch 0 taken 29 times.
✓ Branch 1 taken 29837 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 30590 times.
|
60474 | while (pcit.index_ < n_iterations_) { |
| 195 | // Repeatedly adding sx_fixp_vector_step is faster than multiplication, | ||
| 196 | // but it accumulates fixed-point error; periodic recalibration resets | ||
| 197 | // it. The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1 | ||
| 198 | // << 16). Only the upper 8 bits of the 16-bit fractional part are used | ||
| 199 | // for interpolation, so once the accumulated error reaches 1 / (1 << | ||
| 200 | // 8), it can affect later stages. This corresponds to 512 additions, | ||
| 201 | // but it will trigger each 3rd time, so the mask should be set to 128. | ||
| 202 | 60427 | constexpr uint64_t kRecalibrateCycleMask = ((1 << 7) - 1); | |
| 203 |
4/4✓ Branch 0 taken 29544 times.
✓ Branch 1 taken 293 times.
✓ Branch 2 taken 30286 times.
✓ Branch 3 taken 304 times.
|
60427 | if ((pcit.index_ & kRecalibrateCycleMask) == 0) { |
| 204 | 597 | sx_fixp = to_src_x(dx); | |
| 205 | 597 | } | |
| 206 | |||
| 207 | 60427 | calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_R, vsx0t_R, | |
| 208 | vsx1b_R, vsx1t_R, vsxfrac_bottom_tbl, | ||
| 209 | vsxfrac_top_tbl, vchannels_R); | ||
| 210 |
4/4✓ Branch 0 taken 29822 times.
✓ Branch 1 taken 15 times.
✓ Branch 2 taken 30580 times.
✓ Branch 3 taken 10 times.
|
60427 | n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index |
| 211 | 60402 | ? pcit.index_ | |
| 212 | 25 | : n_iterations_2x_; | |
| 213 | 60427 | ++pcit; | |
| 214 |
4/4✓ Branch 0 taken 29 times.
✓ Branch 1 taken 29808 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 30580 times.
|
60427 | if (pcit.index_ >= n_iterations_) { |
| 215 | 39 | break; | |
| 216 | } | ||
| 217 | 60388 | calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_G, vsx0t_G, | |
| 218 | vsx1b_G, vsx1t_G, vsxfrac_bottom_tbl, | ||
| 219 | vsxfrac_top_tbl, vchannels_G); | ||
| 220 |
4/4✓ Branch 0 taken 29790 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 30537 times.
✓ Branch 3 taken 43 times.
|
60388 | n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index |
| 221 | 60327 | ? pcit.index_ | |
| 222 | 61 | : n_iterations_2x_; | |
| 223 | 60388 | ++pcit; | |
| 224 |
4/4✓ Branch 0 taken 6 times.
✓ Branch 1 taken 29802 times.
✓ Branch 2 taken 41 times.
✓ Branch 3 taken 30539 times.
|
60388 | if (pcit.index_ >= n_iterations_) { |
| 225 | 47 | break; | |
| 226 | } | ||
| 227 | 60341 | calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_B, vsx0t_B, | |
| 228 | vsx1b_B, vsx1t_B, vsxfrac_bottom_tbl, | ||
| 229 | vsxfrac_top_tbl, vchannels_B); | ||
| 230 |
4/4✓ Branch 0 taken 29771 times.
✓ Branch 1 taken 31 times.
✓ Branch 2 taken 30519 times.
✓ Branch 3 taken 20 times.
|
60341 | n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index |
| 231 | 60290 | ? pcit.index_ | |
| 232 | 51 | : n_iterations_2x_; | |
| 233 | 60341 | ++pcit; | |
| 234 | 60341 | sx_fixp += sx_fixp_step3; | |
| 235 | 60341 | dx += kStep_; | |
| 236 | 60427 | } | |
| 237 | 133 | return true; | |
| 238 | 139 | } | |
| 239 | |||
| 240 | 17206 | size_t n_iterations() const KLEIDICV_STREAMING { return n_iterations_; } | |
| 241 | 202472 | size_t n_iterations_2x() const KLEIDICV_STREAMING { return n_iterations_2x_; } | |
| 242 | uint64_t *src_bases() const KLEIDICV_STREAMING { | ||
| 243 | return precalc_src_bases_.get(); | ||
| 244 | } | ||
| 245 | uint8_t *idx_frac() const KLEIDICV_STREAMING { | ||
| 246 | return precalc_idx_frac_.get(); | ||
| 247 | } | ||
| 248 | |||
| 249 | private: | ||
| 250 | using FreeDeleter = decltype(&std::free); | ||
| 251 | |||
| 252 | 457 | bool allocate_temp_buffers() KLEIDICV_STREAMING { | |
| 253 | // Allocate a bit more so don't have to care about overindexing | ||
| 254 | 457 | ptrdiff_t rounded_width = align_up(dst_width_ * kChannels, kStep_); | |
| 255 | 457 | n_iterations_ = rounded_width / kStep_; | |
| 256 | 457 | size_t idx_bytes = sizeof(uint8_t) * rounded_width; | |
| 257 | 457 | size_t xfrac_bytes = sizeof(uint16_t) * rounded_width; | |
| 258 | 914 | precalc_idx_frac_.reset( | |
| 259 | 457 | static_cast<uint8_t *>(malloc(idx_bytes + xfrac_bytes))); | |
| 260 | 457 | size_t src_bases_bytes = sizeof(uint64_t) * rounded_width / kStep_; | |
| 261 | 457 | precalc_src_bases_.reset(static_cast<uint64_t *>(malloc(src_bases_bytes))); | |
| 262 | 1371 | return (reinterpret_cast<uintptr_t>(precalc_idx_frac_.get()) & | |
| 263 | 457 | reinterpret_cast<uintptr_t>(precalc_src_bases_.get())); | |
| 264 | 457 | } | |
| 265 | |||
| 266 | template <typename T = uint64_t> | ||
| 267 | 35868 | static T rounding_div(uint64_t nom, uint64_t denom) KLEIDICV_STREAMING { | |
| 268 | 35868 | return static_cast<T>((nom + denom / 2) / denom); | |
| 269 | } | ||
| 270 | |||
| 271 | // Scale coordinate using this formula, so the center is aligned: | ||
| 272 | // source_x = (destination_x + 0.5) / scale - 0.5; | ||
| 273 | // plus 1/256/2 for later rounding the fractional part to 8bits | ||
| 274 | 1685 | static uint64_t aligned_scale(uint64_t x, uint64_t nom, | |
| 275 | uint64_t denom) KLEIDICV_STREAMING { | ||
| 276 | 3370 | return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) - | |
| 277 | 1685 | kFixpHalf + (1 << (kFixpBits - 9)); | |
| 278 | } | ||
| 279 | |||
| 280 | 1685 | uint64_t to_src_x(uint64_t dx) const KLEIDICV_STREAMING { | |
| 281 | 1685 | return aligned_scale(dx, src_width_, dst_width_); | |
| 282 | } | ||
| 283 | |||
| 284 | // Scale destination x coordinate to source x coordinate, into fixed-point, | ||
| 285 | // without center correction | ||
| 286 | 33744 | uint32_t scale_x(uint64_t dx) const KLEIDICV_STREAMING { | |
| 287 | 33744 | return rounding_div<uint32_t>(((dx * src_width_) << kFixpBits), dst_width_); | |
| 288 | } | ||
| 289 | |||
| 290 | 2820 | svuint32_t make_vsx0(uint64_t dx) const KLEIDICV_STREAMING { | |
| 291 | // Creates source x coordinates starting with dx, stepping by 2 | ||
| 292 | // and finally shifted left by 8, to support the later svaddhn operation | ||
| 293 | 2820 | uint32_t sx[64]; // maximum possible vector length in u32 units | |
| 294 |
12/12✓ Branch 0 taken 288 times.
✓ Branch 1 taken 3456 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 3888 times.
✓ Branch 4 taken 768 times.
✓ Branch 5 taken 9120 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 3456 times.
✓ Branch 8 taken 324 times.
✓ Branch 9 taken 3888 times.
✓ Branch 10 taken 828 times.
✓ Branch 11 taken 9936 times.
|
36564 | for (size_t i = 0; i < svcntw(); ++i) { |
| 295 | 33744 | sx[i] = scale_x((dx + 2 * i) / kChannels) << 8; | |
| 296 | 33744 | } | |
| 297 | 5640 | return svld1(svptrue_b32(), sx); | |
| 298 | 2820 | } | |
| 299 | |||
| 300 | 362151 | void calculate_indices_fractions_srcindex( | |
| 301 | PrecalcIterator<kRatio> &pcit, uint64_t sx_fixp, const svuint32_t &vsx0b, | ||
| 302 | const svuint32_t &vsx0t, const svuint32_t &vsx1b, const svuint32_t &vsx1t, | ||
| 303 | const svuint8_t &vsxfrac_bottom_tbl, const svuint8_t &vsxfrac_top_tbl, | ||
| 304 | [[maybe_unused]] const svuint8_t &vchannels) const KLEIDICV_STREAMING { | ||
| 305 | // << 8: to prepare for addhn, have the fractional part in the high half | ||
| 306 | 724302 | uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp & ((1 << kFixpBits) - 1)) | |
| 307 | 362151 | << 8; | |
| 308 | // get the interesting part: 8+8 bits of integer and fractional part | ||
| 309 | 724302 | svuint16x2_t vsx_delta = | |
| 310 | 724302 | svcreate2(svaddhnt_n_u32(svaddhnb_n_u32(vsx0b, xfrac0), vsx0t, xfrac0), | |
| 311 | 362151 | svaddhnt_n_u32(svaddhnb_n_u32(vsx1b, xfrac0), vsx1t, xfrac0)); | |
| 312 | if constexpr (kChannels == 3) { | ||
| 313 | // When vsx0 starts from other than zero, this offset must be subtracted | ||
| 314 | 181156 | uint16_t start{}; | |
| 315 | 181156 | svst1(svptrue_pat_b16(SV_VL1), &start, svget2(vsx_delta, 0)); | |
| 316 | 181156 | start = start & 0xFF00; | |
| 317 | 181156 | vsx_delta = | |
| 318 | 362312 | svcreate2(svsub_n_u16_x(svptrue_b16(), svget2(vsx_delta, 0), start), | |
| 319 | 181156 | svsub_n_u16_x(svptrue_b16(), svget2(vsx_delta, 1), start)); | |
| 320 | 181156 | sx_fixp += (start >> 8) << kFixpBits; | |
| 321 | 181156 | } | |
| 322 | 724302 | svuint8x2_t vsx_delta8 = | |
| 323 | 724302 | svcreate2(svreinterpret_u8_u16(svget2(vsx_delta, 0)), | |
| 324 | 362151 | svreinterpret_u8_u16(svget2(vsx_delta, 1))); | |
| 325 | // left pixels' indices: integer part | ||
| 326 | 724302 | svuint8_t vsx_left_idx = | |
| 327 | 362151 | svuzp2_u8(svget2(vsx_delta8, 0), svget2(vsx_delta8, 1)); | |
| 328 | if constexpr (kChannels > 1) { | ||
| 329 | if constexpr (kChannels == 3) { | ||
| 330 | 181156 | vsx_left_idx = svmul_n_u8_x(svptrue_b8(), vsx_left_idx, 3); | |
| 331 | } else { | ||
| 332 | static_assert(kChannels == 2 || kChannels == 4); | ||
| 333 | 120734 | vsx_left_idx = | |
| 334 | 120734 | svlsl_n_u8_x(svptrue_b8(), vsx_left_idx, kChannels == 4 ? 2 : 1); | |
| 335 | } | ||
| 336 | 301890 | vsx_left_idx = svadd_u8_x(svptrue_b8(), vsx_left_idx, vchannels); | |
| 337 | } | ||
| 338 | |||
| 339 | 362151 | uint64_t srcindex = (sx_fixp >> kFixpBits) * kChannels; | |
| 340 | if constexpr (kChannels == 3) { | ||
| 341 | // When vsx_left_idx starts from other than zero, this offset must be | ||
| 342 | // subtracted | ||
| 343 | 181156 | uint8_t start{}; | |
| 344 | 181156 | svst1(svptrue_pat_b8(SV_VL1), &start, vsx_left_idx); | |
| 345 | 181156 | vsx_left_idx = svsub_n_u8_x(svptrue_b8(), vsx_left_idx, start); | |
| 346 | 181156 | srcindex += start; | |
| 347 | 181156 | } | |
| 348 | |||
| 349 | 362151 | *pcit.src_index_ptr_ = srcindex; | |
| 350 | 362151 | svst1(svptrue_b8(), pcit.idx_ptr_, vsx_left_idx); | |
| 351 | |||
| 352 | // fractional part is widened to 16 bits for further operations | ||
| 353 | 724302 | svuint16_t vsxfrac_b = | |
| 354 | 362151 | svreinterpret_u16_u8(svtbl2_u8(vsx_delta8, vsxfrac_bottom_tbl)); | |
| 355 | 724302 | svuint16_t vsxfrac_t = | |
| 356 | 362151 | svreinterpret_u16_u8(svtbl2_u8(vsx_delta8, vsxfrac_top_tbl)); | |
| 357 | 362151 | svst1(svptrue_b16(), pcit.frac_ptr_, vsxfrac_b); | |
| 358 | 362151 | svst1_vnum(svptrue_b16(), pcit.frac_ptr_, 1, vsxfrac_t); | |
| 359 | 362151 | } | |
| 360 | |||
| 361 | const size_t src_width_; | ||
| 362 | const size_t dst_width_; | ||
| 363 | size_t n_iterations_; | ||
| 364 | size_t n_iterations_2x_; | ||
| 365 | const ptrdiff_t kStep_; | ||
| 366 | std::unique_ptr<uint64_t, FreeDeleter> precalc_src_bases_; | ||
| 367 | std::unique_ptr<uint8_t, FreeDeleter> precalc_idx_frac_; | ||
| 368 | }; | ||
| 369 | |||
| 370 | // ratio: number of vectors to load and resize to 1 vector | ||
| 371 | // - supported combinations of (ratio, channel): | ||
| 372 | // (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3) | ||
| 373 | template <ptrdiff_t kRatio, ptrdiff_t kChannels> | ||
| 374 | class ResizeGenericU8Operation final { | ||
| 375 | public: | ||
// Stores the description of one resize job; no pixel work is done here.
//   src, src_stride, src_width, src_height: source image
//   y_begin, y_end: half-open range of destination rows that process_rows()
//                   will produce
//   dst, dst_stride, dst_width, dst_height: destination image
// kStep_ is set to the vector length in bytes (svcntb()) and is also handed to
// the precalculation helper together with both widths.
ResizeGenericU8Operation(const uint8_t *src, size_t src_stride,
                         size_t src_width, size_t src_height, size_t y_begin,
                         size_t y_end,
                         uint8_t *dst,  // NOLINT
                         size_t dst_stride, size_t dst_width,
                         size_t dst_height) KLEIDICV_STREAMING
    : src_rows_{src, src_stride, kChannels},
      dst_rows_{dst, dst_stride, kChannels},
      src_width_{src_width},
      src_height_{src_height},
      y_begin_{y_begin},
      y_end_{y_end},
      dst_width_{dst_width},
      dst_height_{dst_height},
      kStep_{static_cast<ptrdiff_t>(svcntb())},
      precalc_{src_width, dst_width, kStep_} {}
| 392 | |||
| 393 | 457 | kleidicv_error_t process_rows() KLEIDICV_STREAMING { | |
| 394 | 457 | bool precalc_success = false; | |
| 395 | if constexpr (kChannels == 3) { | ||
| 396 | 139 | precalc_success = | |
| 397 | 139 | precalc_.precalculate_indices_fractions_srcindices_3ch(); | |
| 398 | } else { | ||
| 399 | 318 | precalc_success = precalc_.precalculate_indices_fractions_srcindices(); | |
| 400 | } | ||
| 401 |
12/12✓ Branch 0 taken 72 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 81 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 64 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 3 times.
✓ Branch 8 taken 81 times.
✓ Branch 9 taken 3 times.
✓ Branch 10 taken 69 times.
✓ Branch 11 taken 3 times.
|
457 | if (!precalc_success) { |
| 402 | 18 | return KLEIDICV_ERROR_ALLOCATION; | |
| 403 | } | ||
| 404 | |||
| 405 |
12/12✓ Branch 0 taken 672 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1032 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 964 times.
✓ Branch 5 taken 64 times.
✓ Branch 6 taken 498 times.
✓ Branch 7 taken 72 times.
✓ Branch 8 taken 858 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 804 times.
✓ Branch 11 taken 69 times.
|
5267 | for (uint64_t dst_y = y_begin_; dst_y < y_end_; ++dst_y) { |
| 406 | 4828 | process_row(dst_y); | |
| 407 | 4828 | } | |
| 408 | |||
| 409 | 439 | return KLEIDICV_OK; | |
| 410 | 457 | } | |
| 411 | |||
| 412 | private: | ||
| 413 | template <typename T = uint64_t> | ||
| 414 | 4828 | static T rounding_div(uint64_t nom, uint64_t denom) KLEIDICV_STREAMING { | |
| 415 | 4828 | return static_cast<T>((nom + denom / 2) / denom); | |
| 416 | } | ||
| 417 | |||
| 418 | // Scale coordinate using this formula, so the center is aligned: | ||
| 419 | // source_x = (destination_x + 0.5) / scale - 0.5; | ||
| 420 | // plus 1/256/2 for later rounding the fractional part to 8bits | ||
| 421 | 4828 | static uint64_t aligned_scale(uint64_t x, uint64_t nom, | |
| 422 | uint64_t denom) KLEIDICV_STREAMING { | ||
| 423 | 9656 | return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) - | |
| 424 | 4828 | kFixpHalf + (1 << (kFixpBits - 9)); | |
| 425 | } | ||
| 426 | |||
// Converts destination row dy to a center-aligned fixed-point source y
// coordinate, using the src_height_ / dst_height_ ratio.
uint64_t to_src_y(uint64_t dy) const KLEIDICV_STREAMING {
  return aligned_scale(dy, src_height_, dst_height_);
}
| 430 | |||
| 431 | 1222998 | static svuint16_t svshll8b(svuint8_t a) KLEIDICV_STREAMING { | |
| 432 | 1222998 | return svreinterpret_u16_u8(svtrn1(svdup_n_u8(0), a)); | |
| 433 | } | ||
| 434 | 1222998 | static svuint16_t svshll8t(svuint8_t a) KLEIDICV_STREAMING { | |
| 435 | 1222998 | return svreinterpret_u16_u8(svtrn2(svdup_n_u8(0), a)); | |
| 436 | } | ||
| 437 | |||
// Loads two consecutive full vectors of bytes starting at p.
// With SME2 a single multi-vector load is used, otherwise two single loads.
static svuint8x2_t load8x2_u8(const uint8_t *p) KLEIDICV_STREAMING {
#if KLEIDICV_TARGET_SME2
  return svld1_x2(svptrue_c8(), p);
#else
  return svcreate2(svld1(svptrue_b8(), p), svld1_vnum(svptrue_b8(), p, 1));
#endif
}
| 445 | |||
| 446 | 15268 | svuint8x2_t load8x2_while_u8(const uint8_t *p, uint64_t i, | |
| 447 | uint64_t n) const KLEIDICV_STREAMING { | ||
| 448 | #if KLEIDICV_TARGET_SME2 | ||
| 449 | 5264 | return svld1_x2(svwhilelt_c8(i, n, 2), p); | |
| 450 | #else | ||
| 451 | 10004 | svbool_t pg1 = svwhilelt_b8(i, n); | |
| 452 | 10004 | svbool_t pg2 = svwhilelt_b8(i + kStep_, n); | |
| 453 | 20008 | return svcreate2(svld1(pg1, p), svld1_vnum(pg2, p, 1)); | |
| 454 | #endif | ||
| 455 | 10004 | } | |
| 456 | |||
// Loads three consecutive full vectors of bytes starting at p.
// With SME2 the first two come from one multi-vector load and the third from a
// single load; otherwise three single loads are used.
static svuint8x3_t load8x3_u8(const uint8_t *p) KLEIDICV_STREAMING {
#if KLEIDICV_TARGET_SME2
  svuint8x2_t sv2 = svld1_x2(svptrue_c8(), p);
  return svcreate3(svget2(sv2, 0), svget2(sv2, 1),
                   svld1_vnum(svptrue_b8(), p, 2));
#else
  return svcreate3(svld1(svptrue_b8(), p), svld1_vnum(svptrue_b8(), p, 1),
                   svld1_vnum(svptrue_b8(), p, 2));
#endif
}
| 467 | |||
| 468 | 9488 | svuint8x3_t load8x3_while_u8(const uint8_t *p, uint64_t i, | |
| 469 | uint64_t n) const KLEIDICV_STREAMING { | ||
| 470 | #if KLEIDICV_TARGET_SME2 | ||
| 471 | 3050 | svcount_t pgc = svwhilelt_c8(i, n, 2); | |
| 472 | 3050 | svbool_t pgb = svwhilelt_b8(i + 2 * kStep_, n); | |
| 473 | 3050 | svuint8x2_t sv2 = svld1_x2(pgc, p); | |
| 474 | 6100 | return svcreate3(svget2(sv2, 0), svget2(sv2, 1), svld1_vnum(pgb, p, 2)); | |
| 475 | #else | ||
| 476 | 6438 | svbool_t pg1 = svwhilelt_b8(i, n); | |
| 477 | 6438 | svbool_t pg2 = svwhilelt_b8(i + kStep_, n); | |
| 478 | 6438 | svbool_t pg3 = svwhilelt_b8(i + 2 * kStep_, n); | |
| 479 | 19314 | return svcreate3(svld1(pg1, p), svld1_vnum(pg2, p, 1), | |
| 480 | 6438 | svld1_vnum(pg3, p, 2)); | |
| 481 | #endif | ||
| 482 | 9488 | } | |
| 483 | |||
// Bilinear interpolation of one destination vector.
//   pcit:  supplies the precalculated horizontal fractions (frac_ptr_)
//   yfrac: vertical fraction, applied to every lane
//   a, b:  left and right neighbours from the top source row
//   c, d:  left and right neighbours from the bottom source row
// Every step has the form (p << 8) + (q - p) * frac + 128, of which the high
// byte is kept; even and odd byte lanes are processed separately (b/t forms).
svuint8_t interpolate(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
                      svuint8_t a, svuint8_t b, svuint8_t c,
                      svuint8_t d) const KLEIDICV_STREAMING {
#if KLEIDICV_TARGET_SME2
  svuint16x2_t vsxfrac = svld1_x2(svptrue_c8(), pcit.frac_ptr_);
  svuint16_t vsxfrac_b = svget2(vsxfrac, 0);
  svuint16_t vsxfrac_t = svget2(vsxfrac, 1);
#else
  svuint16_t vsxfrac_b = svld1(svptrue_b16(), pcit.frac_ptr_);
  svuint16_t vsxfrac_t = svld1_vnum(svptrue_b16(), pcit.frac_ptr_, 1);
#endif
  // Rounding term for taking the high byte.
  svuint16_t half = svdup_n_u16(128);
  // Vertical pass: blend top and bottom rows, for the left and right columns.
  svuint8_t left = svaddhnb(
      svshll8b(a), svmla_n_u16_x(svptrue_b16(), half, svsublb(c, a), yfrac));
  svuint8_t right = svaddhnb(
      svshll8b(b), svmla_n_u16_x(svptrue_b16(), half, svsublb(d, b), yfrac));
  left = svaddhnt(left, svshll8t(a),
                  svmla_n_u16_x(svptrue_b16(), half, svsublt(c, a), yfrac));
  right = svaddhnt(right, svshll8t(b),
                   svmla_n_u16_x(svptrue_b16(), half, svsublt(d, b), yfrac));

  // Horizontal pass: blend left and right using the per-lane fractions.
  svuint8_t res =
      svaddhnb(svshll8b(left),
               svmla_x(svptrue_b16(), half, svsublb(right, left), vsxfrac_b));
  return svaddhnt(
      res, svshll8t(left),
      svmla_x(svptrue_b16(), half, svsublt(right, left), vsxfrac_t));
}
| 512 | |||
| 513 | 206836 | svuint8_t common_vector_path_r2( | |
| 514 | const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, svuint8x2_t topsrc, | ||
| 515 | svuint8x2_t bottomsrc) const KLEIDICV_STREAMING { | ||
| 516 | 206836 | svuint8_t vsx0_idx = svld1(svptrue_b8(), pcit.idx_ptr_); | |
| 517 | 206836 | svuint8_t vsx1_idx = svadd_n_u8_x(svptrue_b8(), vsx0_idx, kChannels); | |
| 518 | 206836 | svuint8_t a = svtbl2_u8(topsrc, vsx0_idx); | |
| 519 | 206836 | svuint8_t b = svtbl2_u8(topsrc, vsx1_idx); | |
| 520 | 206836 | svuint8_t c = svtbl2_u8(bottomsrc, vsx0_idx); | |
| 521 | 206836 | svuint8_t d = svtbl2_u8(bottomsrc, vsx1_idx); | |
| 522 | 413672 | return interpolate(pcit, yfrac, a, b, c, d); | |
| 523 | 206836 | } | |
| 524 | |||
| 525 | 199202 | svuint8_t vector_path_r2(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, | |
| 526 | const uint8_t *src_top, | ||
| 527 | const uint8_t *src_bottom) const KLEIDICV_STREAMING { | ||
| 528 | // Load 2*step elements, that's enough for 1/2 < scale < 1.0 | ||
| 529 | 199202 | uint64_t src_index = *pcit.src_index_ptr_; | |
| 530 | 199202 | svuint8x2_t topsrc = load8x2_u8(&src_top[src_index]); | |
| 531 | 199202 | svuint8x2_t bottomsrc = load8x2_u8(&src_bottom[src_index]); | |
| 532 | 398404 | return common_vector_path_r2(pcit, yfrac, topsrc, bottomsrc); | |
| 533 | 199202 | } | |
| 534 | |||
| 535 | 7634 | svuint8_t remaining_path_r2(const PrecalcIterator<kRatio> &pcit, | |
| 536 | uint16_t yfrac, const uint8_t *src_top, | ||
| 537 | const uint8_t *src_bottom) const | ||
| 538 | KLEIDICV_STREAMING { | ||
| 539 | // Load 2*step elements, that's enough for 1/2 < scale < 1.0 | ||
| 540 | 7634 | uint64_t src_index = *pcit.src_index_ptr_; | |
| 541 | 15268 | svuint8x2_t topsrc = load8x2_while_u8(&src_top[src_index], src_index, | |
| 542 | 7634 | src_width_ * kChannels); | |
| 543 | 15268 | svuint8x2_t bottomsrc = load8x2_while_u8(&src_bottom[src_index], src_index, | |
| 544 | 7634 | src_width_ * kChannels); | |
| 545 | 15268 | return common_vector_path_r2(pcit, yfrac, topsrc, bottomsrc); | |
| 546 | 7634 | } | |
| 547 | |||
// Produces one destination vector from three already loaded source vectors
// per row. The lookup is done in two stages: a two-vector table lookup on
// vectors 0 and 1, then, with the indices rebased by 2 * kStep_, a lookup on
// vector 2 that only replaces the lanes whose rebased index is in range.
svuint8_t common_vector_path_r3(
    const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, svuint8x3_t topsrc,
    svuint8x3_t bottomsrc) const KLEIDICV_STREAMING {
  // Left neighbour indices; right neighbours are kChannels bytes further.
  svuint8_t vsx0_idx = svld1(svptrue_b8(), pcit.idx_ptr_);
  svuint8_t vsx1_idx = svadd_n_u8_x(svptrue_b8(), vsx0_idx, kChannels);
  svuint8_t a =
      svtbl2_u8(svcreate2(svget3(topsrc, 0), svget3(topsrc, 1)), vsx0_idx);
  svuint8_t b =
      svtbl2_u8(svcreate2(svget3(topsrc, 0), svget3(topsrc, 1)), vsx1_idx);
  svuint8_t c = svtbl2_u8(
      svcreate2(svget3(bottomsrc, 0), svget3(bottomsrc, 1)), vsx0_idx);
  svuint8_t d = svtbl2_u8(
      svcreate2(svget3(bottomsrc, 0), svget3(bottomsrc, 1)), vsx1_idx);

  // Rebase the indices so that 0 refers to the first byte of the third vector.
  vsx0_idx =
      svsub_n_u8_x(svptrue_b8(), vsx0_idx, static_cast<uint8_t>(2 * kStep_));
  vsx1_idx =
      svsub_n_u8_x(svptrue_b8(), vsx1_idx, static_cast<uint8_t>(2 * kStep_));
  a = svtbx_u8(a, svget3(topsrc, 2), vsx0_idx);
  b = svtbx_u8(b, svget3(topsrc, 2), vsx1_idx);
  c = svtbx_u8(c, svget3(bottomsrc, 2), vsx0_idx);
  d = svtbx_u8(d, svget3(bottomsrc, 2), vsx1_idx);
  return interpolate(pcit, yfrac, a, b, c, d);
}
| 572 | |||
| 573 | 196086 | svuint8_t vector_path_r3(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, | |
| 574 | const uint8_t *src_top, | ||
| 575 | const uint8_t *src_bottom) const KLEIDICV_STREAMING { | ||
| 576 | // Ratio-3 path used by the main loop of process_row: reads this iteration's precalculated source offset, loads one svuint8x3_t from each of the two source rows at that offset and defers to common_vector_path_r3. Load 3*step elements per row, that's enough for 1/3 < scale < 1.0. NOTE(review): this comment used to say 3*2*step; one call loads a single three-vector tuple per row, matching the 3*step wording in remaining_path_r3 - the factor 2 presumably referred to process_row calling this twice per loop pass | ||
| 577 | 196086 | uint64_t src_index = *pcit.src_index_ptr_; | |
| 578 | 196086 | svuint8x3_t topsrc = load8x3_u8(&src_top[src_index]); | |
| 579 | 196086 | svuint8x3_t bottomsrc = load8x3_u8(&src_bottom[src_index]); | |
| 580 | 392172 | return common_vector_path_r3(pcit, yfrac, topsrc, bottomsrc); | |
| 581 | 196086 | } | |
| 582 | |||
| 583 | 4744 | svuint8_t remaining_path_r3(const PrecalcIterator<kRatio> &pcit, | |
| 584 | uint16_t yfrac, const uint8_t *src_top, | ||
| 585 | const uint8_t *src_bottom) const | ||
| 586 | KLEIDICV_STREAMING { | ||
| 587 | // Ratio-3 path used by the tail loop of process_row: same steps as vector_path_r3, but the loads go through load8x3_while_u8, which also receives src_index and the source row length in elements (src_width_ * kChannels) - presumably to mask off lanes beyond the end of the source row, TODO confirm. Load 3*step elements, that's enough for 1/3 < scale < 1.0 | ||
| 588 | 4744 | uint64_t src_index = *pcit.src_index_ptr_; | |
| 589 | 9488 | svuint8x3_t topsrc = load8x3_while_u8(&src_top[src_index], src_index, | |
| 590 | 4744 | src_width_ * kChannels); | |
| 591 | 9488 | svuint8x3_t bottomsrc = load8x3_while_u8(&src_bottom[src_index], src_index, | |
| 592 | 4744 | src_width_ * kChannels); | |
| 593 | 9488 | return common_vector_path_r3(pcit, yfrac, topsrc, bottomsrc); | |
| 594 | 4744 | } | |
| 595 | |||
| 596 | 4828 | void process_row(uint64_t dy) const KLEIDICV_STREAMING { | |
| | // Produces destination row dy. to_src_y(dy) gives the source y in fixed point; its integer part selects the top source row and the row after it is the bottom one. The row is then written left to right, one precalc iteration per result vector. NOTE(review): both if-constexpr chains below handle only kRatio 2 and 3; for any other value the result vectors would be stored uninitialised - a static_assert would make that explicit. | ||
| 597 | 4828 | uint64_t sy_fixp = to_src_y(dy); | |
| 598 | 4828 | ptrdiff_t sy = static_cast<ptrdiff_t>(sy_fixp >> kFixpBits); | |
| 599 | 4828 | const uint8_t *src_top = &src_rows_.at(sy)[0]; | |
| 600 | 4828 | const uint8_t *src_bottom = &src_rows_.at(sy + 1)[0]; | |
| 601 | 4828 | uint8_t *dst = &dst_rows_.at(static_cast<ptrdiff_t>(dy))[0]; | |
| 602 | 4828 | uint8_t *dst_end = dst + dst_width_ * kChannels; | |
| 603 | // Get the highest 8 bits of the fractional part | ||
| 604 | // This is a good compromise between accuracy and performance | ||
| 605 | // Because the result is 8bits, the error only affects the least | ||
| 606 | // significant 1-2 bits, see the accuracy calculation in kleidicv.h | ||
| 607 | 9656 | uint16_t yfrac = | |
| 608 | 4828 | static_cast<uint16_t>((sy_fixp - (sy << kFixpBits)) >> (kFixpBits - 8)); | |
| 609 | 4828 | auto pcit = precalc_.begin(); | |
| | // Main loop: consumes two precalc iterations per pass and stores the two result vectors back to back (a single two-vector store when KLEIDICV_TARGET_SME2 is set, two unpredicated single-vector stores otherwise), then advances dst by 2 * kStep_. | ||
| 610 |
12/12✓ Branch 0 taken 15815 times.
✓ Branch 1 taken 672 times.
✓ Branch 2 taken 33519 times.
✓ Branch 3 taken 1032 times.
✓ Branch 4 taken 50267 times.
✓ Branch 5 taken 964 times.
✓ Branch 6 taken 15414 times.
✓ Branch 7 taken 498 times.
✓ Branch 8 taken 32692 times.
✓ Branch 9 taken 858 times.
✓ Branch 10 taken 49937 times.
✓ Branch 11 taken 804 times.
|
202472 | while (pcit.index_ + 1 < precalc_.n_iterations_2x()) { |
| 611 | 197644 | svuint8_t res0, res1; | |
| 612 | if constexpr (kRatio == 3) { | ||
| 613 | 98043 | res0 = vector_path_r3(pcit, yfrac, src_top, src_bottom); | |
| 614 | 98043 | ++pcit; | |
| 615 | 98043 | res1 = vector_path_r3(pcit, yfrac, src_top, src_bottom); | |
| 616 | 98043 | ++pcit; | |
| 617 | } else if constexpr (kRatio == 2) { | ||
| 618 | 99601 | res0 = vector_path_r2(pcit, yfrac, src_top, src_bottom); | |
| 619 | 99601 | ++pcit; | |
| 620 | 99601 | res1 = vector_path_r2(pcit, yfrac, src_top, src_bottom); | |
| 621 | 99601 | ++pcit; | |
| 622 | } | ||
| 623 | #if KLEIDICV_TARGET_SME2 | ||
| 624 | 32079 | svst1(svptrue_c8(), dst, svcreate2(res0, res1)); | |
| 625 | #else | ||
| 626 | 165565 | svst1(svptrue_b8(), dst, res0); | |
| 627 | 165565 | svst1_vnum(svptrue_b8(), dst, 1, res1); | |
| 628 | #endif // KLEIDICV_TARGET_SME2 | ||
| 629 | 197644 | dst += 2 * kStep_; | |
| 630 | 197644 | } | |
| 631 | |||
| 632 | // similar to above, but only a single vector path and with predicates: one precalc iteration per pass, the loads go through the remaining_path_* variants and the store is masked by svwhilelt_b8(0, dst_end - dst), so only lanes that still lie inside the destination row are written | ||
| 633 |
12/12✓ Branch 0 taken 1895 times.
✓ Branch 1 taken 672 times.
✓ Branch 2 taken 2635 times.
✓ Branch 3 taken 1032 times.
✓ Branch 4 taken 3104 times.
✓ Branch 5 taken 964 times.
✓ Branch 6 taken 968 times.
✓ Branch 7 taken 498 times.
✓ Branch 8 taken 2062 times.
✓ Branch 9 taken 858 times.
✓ Branch 10 taken 1714 times.
✓ Branch 11 taken 804 times.
|
17206 | while (pcit.index_ < precalc_.n_iterations()) { |
| 634 | 12378 | svbool_t pgdst = svwhilelt_b8(0L, dst_end - dst); | |
| 635 | 12378 | svuint8_t res; | |
| 636 | if constexpr (kRatio == 2) { | ||
| 637 | 7634 | res = remaining_path_r2(pcit, yfrac, src_top, src_bottom); | |
| 638 | } else if constexpr (kRatio == 3) { | ||
| 639 | 4744 | res = remaining_path_r3(pcit, yfrac, src_top, src_bottom); | |
| 640 | } | ||
| 641 | 12378 | svst1(pgdst, dst, res); | |
| 642 | 12378 | ++pcit; | |
| 643 | 12378 | dst += kStep_; | |
| 644 | 12378 | } | |
| 645 | 4828 | } | |
| 646 | |||
| | // Operation state. src_rows_ / dst_rows_ are indexed by row via .at(); src_width_ and dst_width_ are multiplied by kChannels to get a row length in u8 elements; kStep_ is the amount dst advances per stored result vector (presumably the vector length in bytes - confirm in the constructor, which is not shown here); precalc_ supplies the per-iteration index vector and source offset read through PrecalcIterator. | ||
| 647 | const Rows<const uint8_t> src_rows_; | ||
| 648 | const Rows<uint8_t> dst_rows_; | ||
| 649 | const size_t src_width_; | ||
| 650 | const size_t src_height_; | ||
| 651 | const size_t y_begin_; | ||
| 652 | const size_t y_end_; | ||
| 653 | const size_t dst_width_; | ||
| 654 | const size_t dst_height_; | ||
| 655 | const ptrdiff_t kStep_; | ||
| 656 | PrecalcIndicesFractions<kRatio, kChannels> precalc_; | ||
| 657 | }; | ||
| 658 | |||
| 659 | } // namespace resize_generic_u8 | ||
| 660 | |||
| 661 | // Entry point for one stripe of a u8 resize: builds a ResizeGenericU8Operation<kRatio, kChannels> from the arguments and returns whatever its process_rows() returns (y_begin / y_end presumably bound the rows handled - process_rows is not shown here). ratio: number of vectors to load and resize to 1 vector | ||
| 662 | // - combinations of (ratio, channel) explicitly instantiated below: (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), | ||
| 663 | // (3, 3) | ||
| 664 | template <ptrdiff_t kRatio, ptrdiff_t kChannels> | ||
| 665 | 457 | kleidicv_error_t kleidicv_resize_generic_stripe_u8_sc( | |
| 666 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 667 | size_t y_begin, size_t y_end, | ||
| 668 | uint8_t *dst, // NOLINT | ||
| 669 | size_t dst_stride, size_t dst_width, size_t dst_height) KLEIDICV_STREAMING { | ||
| 670 | 914 | resize_generic_u8::ResizeGenericU8Operation<kRatio, kChannels> operation( | |
| 671 | 457 | src, src_stride, src_width, src_height, y_begin, y_end, dst, dst_stride, | |
| 672 | 457 | dst_width, dst_height); | |
| 673 | 457 | return operation.process_rows(); | |
| 674 | 457 | } | |
| 675 | |||
| | // Expands to an explicit instantiation of kleidicv_resize_generic_stripe_u8_sc<ratio, channels>; the use site supplies the terminating semicolon. One line per (ratio, channels) pair follows. | ||
| 676 | #define KLEIDICV_INSTANTIATE_TEMPLATE_SC(ratio, channels) \ | ||
| 677 | template kleidicv_error_t \ | ||
| 678 | kleidicv_resize_generic_stripe_u8_sc<ratio, channels>( \ | ||
| 679 | const uint8_t *src, size_t src_stride, size_t src_width, \ | ||
| 680 | size_t src_height, size_t y_begin, size_t y_end, uint8_t *dst, \ | ||
| 681 | size_t dst_stride, size_t dst_width, size_t dst_height) \ | ||
| 682 | KLEIDICV_STREAMING | ||
| 683 | |||
| 684 | KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 1L); | ||
| 685 | KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 2L); | ||
| 686 | KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 3L); | ||
| 687 | KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 1L); | ||
| 688 | KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 2L); | ||
| 689 | KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 3L); | ||
| 690 | |||
| 691 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 692 | |||
| 693 | #endif // KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H | ||
| 694 |