| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <cstddef> | ||
| 7 | #include <cstdint> | ||
| 8 | #include <cstdlib> | ||
| 9 | #include <memory> | ||
| 10 | #include <utility> | ||
| 11 | #include <variant> | ||
| 12 | |||
| 13 | #include "kleidicv/ctypes.h" | ||
| 14 | #include "kleidicv/neon.h" | ||
| 15 | #include "kleidicv/utils.h" | ||
| 16 | |||
| 17 | namespace kleidicv::neon::resize_linear_generic_u8 { | ||
| 18 | |||
| 19 | //------------------------------------------------------ | ||
| 20 | /// Generic resize for ratios 1/3 to 1/1, u8 | ||
| 21 | //------------------------------------------------------ | ||
| 22 | |||
| 23 | // For the coordinate calculation, fixed-point format is used, for better | ||
| 24 | // performance. Fixed-point format: | ||
| 25 | // - lowest 16 bits are the fractional part, that is the kFixpBits constant | ||
| 26 | // - at interpolation, the high 8 bits are used from the fractional part | ||
| 27 | // (this is a good compromise between accuracy and performance: because the | ||
| 28 | // result is 8 bits, the error only affects the least significant 1-2 bits; | ||
| 29 | // see the accuracy calculation in kleidicv.h) | ||
| 30 | // - to get the integer part, right shift by 16 bits, or zip/unzip/tbl etc. to | ||
| 31 | // get the bytes needed | ||
| 32 | // - for better accuracy, rounding is needed everywhere, i.e. adding 0.5, which | ||
| 33 | // is 1 << 15 | ||
| 34 | |||
| 35 | static constexpr ptrdiff_t kFixpBits = 16; | ||
| 36 | static constexpr ptrdiff_t kFixpHalf = (1UL << (kFixpBits - 1)); | ||
| 37 | static constexpr ptrdiff_t kStep = kVectorLength / sizeof(uint8_t); | ||
| 38 | static constexpr ptrdiff_t kHalfStep = kStep / 2; | ||
| 39 | |||
| 40 | struct FullVectorInterpolationConstants { | ||
| 41 | uint8_t idx[kStep]; | ||
| 42 | uint16_t xfrac[kStep]; | ||
| 43 | ptrdiff_t src_element_index; | ||
| 44 | }; | ||
| 45 | |||
| 46 | struct HalfVectorInterpolationConstants { | ||
| 47 | uint8_t idx[kHalfStep]; | ||
| 48 | uint16_t xfrac[kHalfStep]; | ||
| 49 | ptrdiff_t src_element_index; | ||
| 50 | ptrdiff_t dst_element_index; | ||
| 51 | }; | ||
| 52 | |||
| 53 | struct VectorPathNums { | ||
| 54 | size_t two_x; | ||
| 55 | size_t half; | ||
| 56 | |||
| 57 | 313 | explicit VectorPathNums(std::pair<size_t, size_t> sizes) | |
| 58 | 313 | : two_x{sizes.first}, half{sizes.second} {} | |
| 59 | }; | ||
| 60 | |||
| 61 | template <typename T = uint64_t> | ||
| 62 | 4050324 | static T rounding_div(uint64_t nom, uint64_t denom) { | |
| 63 | 4050324 | return static_cast<T>((nom + denom / 2) / denom); | |
| 64 | } | ||
| 65 | |||
| 66 | // Scale coordinate using this formula, so the center is aligned: | ||
| 67 | // source_x = (destination_x + 0.5) / scale - 0.5; | ||
| 68 | // plus 1/512 (half of 1/256) for later rounding the fractional part to 8 bits | ||
| 69 | 5019 | static inline uint64_t aligned_scale(uint64_t x, uint64_t nom, uint64_t denom) { | |
| 70 | 5019 | return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) - kFixpHalf + | |
| 71 | (1 << (kFixpBits - 9)); | ||
| 72 | } | ||
| 73 | |||
| 74 | class RowInterpolationConstants { | ||
| 75 | public: | ||
| 76 | // Constructible only through create | ||
| 77 | RowInterpolationConstants() = delete; | ||
| 78 | |||
| 79 | 313 | static std::variant<RowInterpolationConstants, kleidicv_error_t> create( | |
| 80 | VectorPathNums num_of_vector_paths) { | ||
| 81 | { | ||
| 82 | 626 | uint8_t *allocation = static_cast<uint8_t *>(malloc( | |
| 83 | 313 | num_of_vector_paths.two_x * 2 * | |
| 84 | 313 | sizeof(FullVectorInterpolationConstants) + | |
| 85 | 313 | num_of_vector_paths.half * sizeof(HalfVectorInterpolationConstants))); | |
| 86 |
2/2✓ Branch 0 taken 301 times.
✓ Branch 1 taken 12 times.
|
313 | if (!allocation) { |
| 87 | 12 | return KLEIDICV_ERROR_ALLOCATION; | |
| 88 | } | ||
| 89 | |||
| 90 | 301 | return RowInterpolationConstants{num_of_vector_paths, allocation}; | |
| 91 | 313 | } | |
| 92 | return KLEIDICV_OK; | ||
| 93 | 313 | } | |
| 94 | |||
| 95 | 125003 | VectorPathNums num_of_vector_paths() const { return num_of_vector_paths_; } | |
| 96 | |||
| 97 | 486760 | FullVectorInterpolationConstants *full_vector_constants_array() const { | |
| 98 | 486760 | return full_vector_constants_array_; | |
| 99 | } | ||
| 100 | |||
| 101 | 3941 | HalfVectorInterpolationConstants *half_vector_constants_array() const { | |
| 102 | 3941 | return half_vector_constants_array_; | |
| 103 | } | ||
| 104 | |||
| 105 | private: | ||
| 106 | 301 | RowInterpolationConstants(VectorPathNums num_of_vector_paths, uint8_t *buffer) | |
| 107 | 301 | : buffer_{buffer, &std::free}, | |
| 108 | 602 | full_vector_constants_array_{ | |
| 109 | 301 | reinterpret_cast<FullVectorInterpolationConstants *>(buffer)}, | |
| 110 | 602 | half_vector_constants_array_{ | |
| 111 | reinterpret_cast<HalfVectorInterpolationConstants *>( | ||
| 112 | 602 | full_vector_constants_array_ + | |
| 113 | 301 | (num_of_vector_paths.two_x * 2))}, | |
| 114 | 301 | num_of_vector_paths_{num_of_vector_paths} {} | |
| 115 | |||
| 116 | using FreeDeleter = decltype(&std::free); | ||
| 117 | std::unique_ptr<uint8_t, FreeDeleter> buffer_; | ||
| 118 | FullVectorInterpolationConstants *const full_vector_constants_array_; | ||
| 119 | HalfVectorInterpolationConstants *const half_vector_constants_array_; | ||
| 120 | const VectorPathNums num_of_vector_paths_; | ||
| 121 | }; | ||
| 122 | |||
| 123 | template <ptrdiff_t kRatio, ptrdiff_t kChannels> | ||
| 124 | class RowInterpolationConstantsGeneratorBase { | ||
| 125 | protected: | ||
| 126 | 313 | RowInterpolationConstantsGeneratorBase(size_t src_width, size_t dst_width) | |
| 127 | 313 | : src_width_{src_width}, | |
| 128 | 313 | dst_width_{dst_width}, | |
| 129 | 313 | vsidx_tbl_{2, 6, 10, 14, 18, 22, 26, 30}, | |
| 130 | 313 | vsfrac_tbl_{1, 255, 5, 255, 9, 255, 13, 255, | |
| 131 | 313 | 17, 255, 21, 255, 25, 255, 29, 255} {} | |
| 132 | |||
| 133 | 313 | std::pair<size_t, size_t> calculate_num_of_vector_paths() { | |
| 134 |
8/12✓ Branch 0 taken 30 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 56 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 45 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 30 times.
✓ Branch 7 taken 20 times.
✓ Branch 8 taken 56 times.
✗ Branch 9 not taken.
✓ Branch 10 taken 56 times.
✗ Branch 11 not taken.
|
313 | size_t two_x = ((src_width_ * kChannels) >= (sizeof(uint8x16_t) * kRatio)) |
| 135 | 273 | ? ((dst_width_ * kChannels) / (2 * kStep)) | |
| 136 | : 0; | ||
| 137 | |||
| 138 | 626 | size_t remaining_dx_after_2x_cycle = | |
| 139 | 313 | (dst_width_ * kChannels) - (two_x * 2 * kStep); | |
| 140 | 313 | size_t half = align_up(remaining_dx_after_2x_cycle, kHalfStep) / kHalfStep; | |
| 141 | 313 | return {two_x, half}; | |
| 142 | 313 | } | |
| 143 | |||
| 144 | // Scale destination x coordinate to source x coordinate, into fixed-point, | ||
| 145 | // without center correction | ||
| 146 | 3882944 | uint32_t scale_x(uint64_t dx) const { | |
| 147 | 3882944 | return rounding_div<uint32_t>(((dx * src_width_) << kFixpBits), dst_width_); | |
| 148 | } | ||
| 149 | |||
| 150 | 1787 | uint64_t to_src_x(uint64_t dx) const { | |
| 151 | 1787 | return aligned_scale(dx, src_width_, dst_width_); | |
| 152 | } | ||
| 153 | |||
| 154 | const size_t src_width_; | ||
| 155 | const size_t dst_width_; | ||
| 156 | const uint8x8_t vsidx_tbl_; | ||
| 157 | const uint8x16_t vsfrac_tbl_; | ||
| 158 | }; | ||
| 159 | |||
| 160 | template <ptrdiff_t kRatio, ptrdiff_t kChannels> | ||
| 161 | class RowInterpolationConstantsGenerator final | ||
| 162 | : RowInterpolationConstantsGeneratorBase<kRatio, kChannels> { | ||
| 163 | public: | ||
| 164 | using Base = RowInterpolationConstantsGeneratorBase<kRatio, kChannels>; | ||
| 165 | 212 | RowInterpolationConstantsGenerator(size_t src_width, size_t dst_width) | |
| 166 | 212 | : Base{src_width, dst_width}, | |
| 167 | // These starting values are not aligned to center. The center alignment | ||
| 168 | // must be added only once. When added to a center-aligned source_x | ||
| 169 | // value, the result will be center-aligned. | ||
| 170 | 636 | vsx0_0_{Base::scale_x(0), Base::scale_x(1 / kChannels), | |
| 171 | 424 | Base::scale_x(2 / kChannels), Base::scale_x(3 / kChannels)}, | |
| 172 | 636 | vsx0_1_{Base::scale_x(4 / kChannels), Base::scale_x(5 / kChannels), | |
| 173 | 424 | Base::scale_x(6 / kChannels), Base::scale_x(7 / kChannels)}, | |
| 174 | 636 | vsx0_2_{Base::scale_x(8 / kChannels), Base::scale_x(9 / kChannels), | |
| 175 | 424 | Base::scale_x(10 / kChannels), Base::scale_x(11 / kChannels)}, | |
| 176 | 636 | vsx0_3_{Base::scale_x(12 / kChannels), Base::scale_x(13 / kChannels), | |
| 177 | 636 | Base::scale_x(14 / kChannels), Base::scale_x(15 / kChannels)} {} | |
| 178 | |||
| 179 | 212 | std::variant<RowInterpolationConstants, kleidicv_error_t> operator()() { | |
| 180 | 212 | VectorPathNums v{Base::calculate_num_of_vector_paths()}; | |
| 181 | 212 | auto row_interpolation_constants_variant = | |
| 182 | 212 | RowInterpolationConstants::create(v); | |
| 183 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 54 times.
|
212 | if (std::holds_alternative<kleidicv_error_t>( |
| 184 | row_interpolation_constants_variant)) { | ||
| 185 | // Creation failed with some error, return with the variant as it is | ||
| 186 | 8 | return row_interpolation_constants_variant; | |
| 187 | } | ||
| 188 | 204 | auto &row_interpolation_constants = *std::get_if<RowInterpolationConstants>( | |
| 189 | &row_interpolation_constants_variant); | ||
| 190 | |||
| 191 | 204 | uint64_t dx = 0; | |
| 192 | 204 | uint64_t sx_fixp = 0; | |
| 193 | |||
| 194 | // Calculate constants for full vectors | ||
| 195 | |||
| 196 | // Maximum source coordinate for vector path 2x | ||
| 197 | 408 | const uint64_t max_sx_2x = | |
| 198 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
612 | std::max(Base::src_width_ * kChannels - (sizeof(uint8x16_t) * kRatio), |
| 199 | 408 | 0UL) / | |
| 200 | kChannels; | ||
| 201 | // Difference in source x coordinate for one vector path | ||
| 202 | 408 | const uint64_t sx_fixp_vector_step = rounding_div( | |
| 203 | 204 | (Base::src_width_ * kStep / kChannels) << kFixpBits, Base::dst_width_); | |
| 204 | |||
| 205 |
8/8✓ Branch 0 taken 19838 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 39766 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 20242 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 40614 times.
✓ Branch 7 taken 54 times.
|
120664 | for (size_t i = 0; |
| 206 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
120664 | i < row_interpolation_constants.num_of_vector_paths().two_x; ++i) { |
| 207 | // Repeatedly adding sx_fixp_vector_step is faster than scaling dx to sx, | ||
| 208 | // but it accumulates fixed-point error; periodic recalibration resets it. | ||
| 209 | // The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1 << | ||
| 210 | // 16). Only the upper 8 bits of the 16-bit fractional part are used for | ||
| 211 | // interpolation, so once the accumulated error reaches 1 / (1 << 8), it | ||
| 212 | // can affect later stages. This corresponds to 512 additions. Since two | ||
| 213 | // additions are performed per cycle, we recalibrate every 256 cycles, | ||
| 214 | // calculated by this mask. | ||
| 215 | 120460 | constexpr uint64_t kRecalibrateCycleMask = ((1 << 8) - 1); | |
| 216 |
8/8✓ Branch 0 taken 19742 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 39570 times.
✓ Branch 3 taken 196 times.
✓ Branch 4 taken 20144 times.
✓ Branch 5 taken 98 times.
✓ Branch 6 taken 40414 times.
✓ Branch 7 taken 200 times.
|
120460 | if ((i & kRecalibrateCycleMask) == 0) { |
| 217 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
590 | sx_fixp = Base::to_src_x(dx); |
| 218 | 590 | } | |
| 219 | |||
| 220 | // Pull back sx if it would overrun | ||
| 221 | 120460 | uint64_t sx_candidate = sx_fixp >> kFixpBits; | |
| 222 | 120460 | uint64_t sx_base = std::min(max_sx_2x, sx_candidate); | |
| 223 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
120460 | calculate_indices_fractions_base_2x( |
| 224 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
120460 | row_interpolation_constants.full_vector_constants_array()[i * 2], |
| 225 | 120460 | sx_base, sx_fixp); | |
| 226 | 120460 | sx_fixp += sx_fixp_vector_step; | |
| 227 | 120460 | dx += kStep / kChannels; | |
| 228 | |||
| 229 | // Pull back sx if it would overrun | ||
| 230 | 120460 | sx_candidate = sx_fixp >> kFixpBits; | |
| 231 | 120460 | sx_base = std::min(max_sx_2x, sx_candidate); | |
| 232 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
120460 | calculate_indices_fractions_base_2x( |
| 233 | 120460 | row_interpolation_constants | |
| 234 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
120460 | .full_vector_constants_array()[(i * 2) + 1], |
| 235 | 120460 | sx_base, sx_fixp); | |
| 236 | 120460 | sx_fixp += sx_fixp_vector_step; | |
| 237 | 120460 | dx += kStep / kChannels; | |
| 238 | 120460 | } | |
| 239 | |||
| 240 | // Calculate constants for half vectors | ||
| 241 | |||
| 242 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
204 | sx_fixp = Base::to_src_x(dx); |
| 243 | |||
| 244 | // Difference in source x coordinate for one destination pixel | ||
| 245 | 408 | const uint64_t sx_fixp_one_dst_pixel = | |
| 246 | 204 | rounding_div(Base::src_width_ << kFixpBits, Base::dst_width_); | |
| 247 | // Maximum source coordinate for half vector path | ||
| 248 | 408 | const uint64_t max_sx_half = | |
| 249 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
612 | std::max(Base::src_width_ * kChannels - |
| 250 | |||
| 251 | (sizeof(uint8x16_t) * (kRatio - 1)), | ||
| 252 | 408 | 0UL) / | |
| 253 | kChannels; | ||
| 254 | // Maximum destination coordinate for half vector path | ||
| 255 | 204 | const uint64_t max_dx_half = Base::dst_width_ - (kHalfStep / kChannels); | |
| 256 | // Difference in source x coordinate for the half vector path | ||
| 257 | 408 | const uint64_t sx_fixp_half_step = | |
| 258 | 408 | rounding_div((Base::src_width_ * kHalfStep / kChannels) << kFixpBits, | |
| 259 | 204 | Base::dst_width_); | |
| 260 | |||
| 261 |
8/8✓ Branch 0 taken 94 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 118 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 112 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 160 times.
✓ Branch 7 taken 54 times.
|
688 | for (size_t i = 0; |
| 262 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
688 | i < row_interpolation_constants.num_of_vector_paths().half; ++i) { |
| 263 | // If (dx + half vector length) would overrun the buffer, pull it back | ||
| 264 | 484 | uint64_t dx_pulled_back = std::min(dx, max_dx_half); | |
| 265 | // Pull back sx if dx was pulled back | ||
| 266 | 484 | sx_fixp -= (dx - dx_pulled_back) * sx_fixp_one_dst_pixel; | |
| 267 | 484 | dx = dx_pulled_back; | |
| 268 | // If (sx_base + reading length) would overrun the buffer, pull sx back | ||
| 269 | // again | ||
| 270 | 484 | uint64_t sx_candidate = sx_fixp >> kFixpBits; | |
| 271 | 484 | uint64_t sx_base = std::min(max_sx_half, sx_candidate); | |
| 272 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
484 | calculate_indices_fractions_base_half( |
| 273 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
484 | row_interpolation_constants.half_vector_constants_array()[i], sx_base, |
| 274 | 484 | sx_fixp, dx); | |
| 275 | |||
| 276 | 484 | dx += kHalfStep / kChannels; | |
| 277 | 484 | sx_fixp += sx_fixp_half_step; | |
| 278 | 484 | } | |
| 279 | |||
| 280 | 204 | return row_interpolation_constants_variant; | |
| 281 | 212 | } | |
| 282 | |||
| 283 | private: | ||
| 284 | 240920 | void calculate_indices_fractions_base_2x( | |
| 285 | FullVectorInterpolationConstants &constants, uint64_t sx_base, | ||
| 286 | uint64_t sx_fixp) { | ||
| 287 | 240920 | uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp - (sx_base << kFixpBits)); | |
| 288 | 240920 | uint32x4_t vfrac = vdupq_n_u32(xfrac0); | |
| 289 | // Calculate x coordinate delta from sx_base, the integer part of source x | ||
| 290 | 240920 | uint8x16x2_t vsx_delta_lo, vsx_delta_hi; | |
| 291 | 240920 | vsx_delta_lo.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_0_, vfrac)); | |
| 292 | 240920 | vsx_delta_lo.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_1_, vfrac)); | |
| 293 | 240920 | vsx_delta_hi.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_2_, vfrac)); | |
| 294 | 240920 | vsx_delta_hi.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_3_, vfrac)); | |
| 295 | 240920 | uint8x8_t idx0 = vqtbl2_u8(vsx_delta_lo, Base::vsidx_tbl_); | |
| 296 | 240920 | uint8x8_t idx1 = vqtbl2_u8(vsx_delta_hi, Base::vsidx_tbl_); | |
| 297 | 240920 | uint8x16_t vsx0_idx = vcombine_u8(idx0, idx1); | |
| 298 | if constexpr (kChannels > 1) { | ||
| 299 | 160760 | vsx0_idx = vshlq_n_u8(vsx0_idx, kChannels == 4 ? 2 : 1); | |
| 300 | 160760 | vsx0_idx = | |
| 301 | 160760 | vaddq_u8(vsx0_idx, vreinterpretq_u8_u32(vdupq_n_u32( | |
| 302 | kChannels == 4 ? 0x03020100U : 0x01000100))); | ||
| 303 | } | ||
| 304 | 240920 | vst1q(constants.idx, vsx0_idx); | |
| 305 | 240920 | uint16x8x2_t vsxfrac; | |
| 306 | 240920 | vsxfrac.val[0] = | |
| 307 | 240920 | vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_lo, Base::vsfrac_tbl_)); | |
| 308 | 240920 | vsxfrac.val[1] = | |
| 309 | 240920 | vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_hi, Base::vsfrac_tbl_)); | |
| 310 | 240920 | VecTraits<uint16_t>::store(vsxfrac, constants.xfrac); | |
| 311 | 240920 | constants.src_element_index = static_cast<ptrdiff_t>(sx_base * kChannels); | |
| 312 | 240920 | } | |
| 313 | |||
| 314 | 484 | void calculate_indices_fractions_base_half( | |
| 315 | HalfVectorInterpolationConstants &constants, uint64_t sx_base, | ||
| 316 | uint64_t sx_fixp, uint64_t dx) { | ||
| 317 | 484 | uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp - (sx_base << kFixpBits)); | |
| 318 | 484 | uint32x4_t vfrac = vdupq_n_u32(xfrac0); | |
| 319 | 484 | uint8x16x2_t vsx_delta; | |
| 320 | 484 | vsx_delta.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_0_, vfrac)); | |
| 321 | 484 | vsx_delta.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_1_, vfrac)); | |
| 322 | 484 | uint8x8_t vsx0_idx = vqtbl2_u8(vsx_delta, Base::vsidx_tbl_); | |
| 323 | if constexpr (kChannels > 1) { | ||
| 324 | 278 | vsx0_idx = vshl_n_u8(vsx0_idx, kChannels == 4 ? 2 : 1); | |
| 325 | 278 | vsx0_idx = vadd_u8( | |
| 326 | 556 | vsx0_idx, vreinterpret_u8_u32( | |
| 327 | 278 | vdup_n_u32(kChannels == 4 ? 0x03020100U : 0x01000100))); | |
| 328 | } | ||
| 329 | 484 | vst1(constants.idx, vsx0_idx); | |
| 330 | 968 | uint16x8_t vsxfrac = | |
| 331 | 484 | vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta, Base::vsfrac_tbl_)); | |
| 332 | 484 | VecTraits<uint16_t>::store(vsxfrac, constants.xfrac); | |
| 333 | 484 | constants.src_element_index = static_cast<ptrdiff_t>(sx_base * kChannels); | |
| 334 | 484 | constants.dst_element_index = static_cast<ptrdiff_t>(dx * kChannels); | |
| 335 | 484 | } | |
| 336 | |||
| 337 | const uint32x4_t vsx0_0_; | ||
| 338 | const uint32x4_t vsx0_1_; | ||
| 339 | const uint32x4_t vsx0_2_; | ||
| 340 | const uint32x4_t vsx0_3_; | ||
| 341 | }; | ||
| 342 | |||
| 343 | template <ptrdiff_t kRatio> | ||
| 344 | class RowInterpolationConstantsGenerator<kRatio, 3> final | ||
| 345 | : RowInterpolationConstantsGeneratorBase<kRatio, 3> { | ||
| 346 | public: | ||
| 347 | using Base = RowInterpolationConstantsGeneratorBase<kRatio, 3>; | ||
| 348 | 101 | RowInterpolationConstantsGenerator(size_t src_width, size_t dst_width) | |
| 349 | 101 | : Base{src_width, dst_width}, | |
| 350 | 202 | sx_fixp_one_dst_pixel_{ | |
| 351 | 202 | rounding_div(src_width << kFixpBits, dst_width)} {} | |
| 352 | |||
| 353 | 101 | std::variant<RowInterpolationConstants, kleidicv_error_t> operator()() { | |
| 354 | 101 | VectorPathNums v{Base::calculate_num_of_vector_paths()}; | |
| 355 | 101 | auto row_interpolation_constants_variant = | |
| 356 | 101 | RowInterpolationConstants::create(v); | |
| 357 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.
|
101 | if (std::holds_alternative<kleidicv_error_t>( |
| 358 | row_interpolation_constants_variant)) { | ||
| 359 | // Creation failed with some error, return with the variant as it is | ||
| 360 | 4 | return row_interpolation_constants_variant; | |
| 361 | } | ||
| 362 | 97 | auto &row_interpolation_constants = *std::get_if<RowInterpolationConstants>( | |
| 363 | &row_interpolation_constants_variant); | ||
| 364 | |||
| 365 | 97 | uint64_t dst_element_index = 0; | |
| 366 | 97 | uint64_t sx_fixp{}; | |
| 367 | |||
| 368 | // Calculate constants for full vectors | ||
| 369 | |||
| 370 | 194 | size_t num_of_full_vector_constants = | |
| 371 | 97 | row_interpolation_constants.num_of_vector_paths().two_x * 2; | |
| 372 |
2/4✗ Branch 0 not taken.
✓ Branch 1 taken 43 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
|
97 | if (num_of_full_vector_constants > 0) { |
| 373 | 97 | size_t handled_full_vector_paths = 0; | |
| 374 | |||
| 375 |
3/4✓ Branch 0 taken 14 times.
✓ Branch 1 taken 29 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
|
97 | if (num_of_full_vector_constants > 3) { |
| 376 | 166 | size_t num_of_vector_paths_wout_pullback = | |
| 377 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
83 | get_num_of_vector_paths_wout_pullback(num_of_full_vector_constants); |
| 378 | // Handle 3 vectors at a time; that way the in-pixel index is known at | ||
| 379 | // compile time | ||
| 380 | 166 | size_t vector_path_triplets_wout_pullback = | |
| 381 | 83 | num_of_vector_paths_wout_pullback / 3; | |
| 382 | |||
| 383 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
83 | sx_fixp = Base::to_src_x(0); |
| 384 | 83 | unsigned recalibrate_cnt = 0; | |
| 385 |
4/4✓ Branch 0 taken 39686 times.
✓ Branch 1 taken 29 times.
✓ Branch 2 taken 41138 times.
✓ Branch 3 taken 54 times.
|
80907 | for (size_t i = 0; i < vector_path_triplets_wout_pullback; ++i) { |
| 386 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | const uint32x4x4_t vsx_r = gen_vsx_r(); |
| 387 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | const uint8x16_t vsx_idx_diff_r = gen_vsx_idx_diff_r(); |
| 388 | |||
| 389 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | const uint32x4x4_t vsx_g = gen_vsx_g(); |
| 390 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | const uint8x16_t vsx_idx_diff_g = gen_vsx_idx_diff_g(); |
| 391 | |||
| 392 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | const uint32x4x4_t vsx_b = gen_vsx_b(); |
| 393 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | const uint8x16_t vsx_idx_diff_b = gen_vsx_idx_diff_b(); |
| 394 | |||
| 395 | // Difference in source x coordinate for 5 destination pixels | ||
| 396 | 161648 | const uint64_t sx_fixp_five_dst_pixel = rounding_div( | |
| 397 | 80824 | (Base::src_width_ * 5) << kFixpBits, Base::dst_width_); | |
| 398 | // Difference in source x coordinate for 6 destination pixels | ||
| 399 | 161648 | const uint64_t sx_fixp_six_dst_pixel = rounding_div( | |
| 400 | 80824 | (Base::src_width_ * 6) << kFixpBits, Base::dst_width_); | |
| 401 | |||
| 402 | // Repeatedly adding sx_fixp_five_dst_pixel and sx_fixp_six_dst_pixel | ||
| 403 | // is faster than scaling dx to sx, but it accumulates fixed-point | ||
| 404 | // error; periodic recalibration resets it. The maximum per-addition | ||
| 405 | // error of these values is 0.5 / (1 << 16). Only the upper 8 | ||
| 406 | // bits of the 16-bit fractional part are used for interpolation, so | ||
| 407 | // once the accumulated error reaches 1 / (1 << 8), it can affect | ||
| 408 | // later stages. This corresponds to 512 additions. Since three | ||
| 409 | // additions are performed per cycle, we recalibrate every 170 cycles. | ||
| 410 |
4/4✓ Branch 0 taken 228 times.
✓ Branch 1 taken 39458 times.
✓ Branch 2 taken 234 times.
✓ Branch 3 taken 40904 times.
|
80824 | if (recalibrate_cnt == 170) { |
| 411 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
462 | sx_fixp = Base::to_src_x(dst_element_index / 3); |
| 412 | 462 | recalibrate_cnt = 0; | |
| 413 | 462 | } else { | |
| 414 | 80362 | recalibrate_cnt++; | |
| 415 | } | ||
| 416 | |||
| 417 | 80824 | unsigned in_pixel_index = 0; | |
| 418 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | fill_full_constants_vectorially( |
| 419 | 80824 | row_interpolation_constants | |
| 420 | 80824 | .full_vector_constants_array()[handled_full_vector_paths], | |
| 421 | 80824 | vsx_r, vsx_idx_diff_r, sx_fixp, in_pixel_index); | |
| 422 | |||
| 423 | 80824 | sx_fixp += sx_fixp_five_dst_pixel; | |
| 424 | 80824 | in_pixel_index = 1; | |
| 425 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | fill_full_constants_vectorially( |
| 426 | 80824 | row_interpolation_constants | |
| 427 | 80824 | .full_vector_constants_array()[handled_full_vector_paths + 1], | |
| 428 | 80824 | vsx_g, vsx_idx_diff_g, sx_fixp, in_pixel_index); | |
| 429 | |||
| 430 | 80824 | sx_fixp += sx_fixp_five_dst_pixel; | |
| 431 | 80824 | in_pixel_index = 2; | |
| 432 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
80824 | fill_full_constants_vectorially( |
| 433 | 80824 | row_interpolation_constants | |
| 434 | 80824 | .full_vector_constants_array()[handled_full_vector_paths + 2], | |
| 435 | 80824 | vsx_b, vsx_idx_diff_b, sx_fixp, in_pixel_index); | |
| 436 | |||
| 437 | 80824 | sx_fixp += sx_fixp_six_dst_pixel; | |
| 438 | 80824 | handled_full_vector_paths += 3; | |
| 439 | 80824 | dst_element_index += kStep * 3; | |
| 440 | 80824 | } | |
| 441 | 83 | } | |
| 442 | |||
| 443 |
4/4✓ Branch 0 taken 74 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 62 times.
✓ Branch 3 taken 54 times.
|
233 | while (handled_full_vector_paths < num_of_full_vector_constants) { |
| 444 | 272 | auto &constants = | |
| 445 | 136 | row_interpolation_constants | |
| 446 | 136 | .full_vector_constants_array()[handled_full_vector_paths]; | |
| 447 | // Maximum source coordinate for full vector path | ||
| 448 | 272 | const uint64_t max_src_base_index = std::max( | |
| 449 | 136 | (Base::src_width_ * kChannels) - (sizeof(uint8x16_t) * kRatio), | |
| 450 | 136 | 0UL); | |
| 451 | |||
| 452 | 136 | uint64_t dx = dst_element_index / kChannels; | |
| 453 | 136 | unsigned in_pixel_index = dst_element_index % kChannels; | |
| 454 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
136 | sx_fixp = Base::to_src_x(dx); |
| 455 | |||
| 456 | 272 | uint64_t src_element_index = | |
| 457 | 136 | ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index; | |
| 458 | |||
| 459 | // Pull back src if it would overrun | ||
| 460 | 272 | uint64_t src_element_base = | |
| 461 | 136 | std::min(max_src_base_index, src_element_index); | |
| 462 | |||
| 463 |
0/8✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
272 | fill_full_constants_scalarly(constants, in_pixel_index, |
| 464 | 136 | src_element_index, src_element_base, | |
| 465 | 136 | sx_fixp); | |
| 466 | 136 | handled_full_vector_paths++; | |
| 467 | 136 | dst_element_index += kStep; | |
| 468 | 136 | } | |
| 469 | 97 | } | |
| 470 | |||
| 471 | // Calculate constants for half vectors | ||
| 472 | |||
| 473 | // Maximum source coordinate for half vector path | ||
| 474 | 97 | uint64_t half_vector_path_src_read_size = | |
| 475 | kChannels == 3 ? sizeof(uint8x16x2_t) | ||
| 476 | : (sizeof(uint8x16_t) * (kRatio - 1)); | ||
| 477 | 194 | const uint64_t max_src_base_index = std::max( | |
| 478 | 97 | Base::src_width_ * kChannels - half_vector_path_src_read_size, 0UL); | |
| 479 | // Maximum destination coordinate for half vector path | ||
| 480 | 194 | const uint64_t max_dst_index_half = | |
| 481 | 97 | (Base::dst_width_ * kChannels) - kHalfStep; | |
| 482 | |||
| 483 |
4/4✓ Branch 0 taken 87 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 138 times.
✓ Branch 3 taken 54 times.
|
322 | for (size_t i = 0; |
| 484 | 322 | i < row_interpolation_constants.num_of_vector_paths().half; ++i) { | |
| 485 | 450 | auto &constants = | |
| 486 | 225 | row_interpolation_constants.half_vector_constants_array()[i]; | |
| 487 | |||
| 488 | // If (dst index + half vector length) would overrun the buffer, pull it | ||
| 489 | // back | ||
| 490 | 225 | dst_element_index = std::min(dst_element_index, max_dst_index_half); | |
| 491 | |||
| 492 | 225 | uint64_t dx = dst_element_index / kChannels; | |
| 493 | 225 | unsigned in_pixel_index = dst_element_index % kChannels; | |
| 494 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
225 | sx_fixp = Base::to_src_x(dx); |
| 495 | 450 | uint64_t src_element_index = | |
| 496 | 225 | ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index; | |
| 497 | |||
| 498 | // Pull back src if it would overrun | ||
| 499 | 450 | uint64_t src_element_base = | |
| 500 | 225 | std::min(max_src_base_index, src_element_index); | |
| 501 | |||
| 502 |
0/8✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
450 | fill_half_constants_scalarly(constants, dst_element_index, in_pixel_index, |
| 503 | 225 | src_element_index, src_element_base, | |
| 504 | 225 | sx_fixp); | |
| 505 | |||
| 506 | 225 | dst_element_index += kHalfStep; | |
| 507 | 225 | } | |
| 508 | |||
| 509 | 97 | return row_interpolation_constants_variant; | |
| 510 | 101 | } | |
| 511 | |||
| 512 | private: | ||
| 513 | 83 | size_t get_num_of_vector_paths_wout_pullback( | |
| 514 | size_t num_of_full_vector_constants) { | ||
| 515 | 170 | auto vector_needs_pullback = [this](size_t dst_idx) { | |
| 516 | 87 | unsigned in_pixel_idx = dst_idx % kChannels; | |
| 517 | 87 | uint64_t dx = dst_idx / kChannels; | |
| 518 | 87 | uint64_t sx_fixp = Base::to_src_x(dx); | |
| 519 | 87 | uint64_t src_idx = ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_idx; | |
| 520 | |||
| 521 | 174 | return (src_idx + (kStep * kRatio)) > (Base::src_width_ * kChannels); | |
| 522 | 87 | }; | |
| 523 | |||
| 524 |
2/4✓ Branch 0 taken 29 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 54 times.
✗ Branch 3 not taken.
|
83 | if (num_of_full_vector_constants == 0) { |
| 525 | ✗ | return 0; | |
| 526 | } | ||
| 527 | |||
| 528 | 166 | size_t candidate_last_vector_wout_pullback = | |
| 529 | 83 | num_of_full_vector_constants - 1; | |
| 530 | |||
| 531 | 83 | do { | |
| 532 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 29 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.
|
87 | if (!vector_needs_pullback(candidate_last_vector_wout_pullback * kStep)) { |
| 533 | 83 | break; | |
| 534 | } | ||
| 535 | 4 | candidate_last_vector_wout_pullback--; | |
| 536 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
|
4 | } while (candidate_last_vector_wout_pullback > 0); |
| 537 | |||
| 538 |
2/4✓ Branch 0 taken 29 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 54 times.
✗ Branch 3 not taken.
|
83 | if (candidate_last_vector_wout_pullback == 0) { |
| 539 | ✗ | if (vector_needs_pullback(candidate_last_vector_wout_pullback * kStep)) { | |
| 540 | ✗ | return 0; | |
| 541 | } | ||
| 542 | } | ||
| 543 | |||
| 544 | 83 | return candidate_last_vector_wout_pullback + 1; | |
| 545 | 83 | } | |
| 546 | |||
| 547 | 80824 | uint32x4x4_t gen_vsx_r() { | |
| 548 | 161648 | return uint32x4x4_t{ | |
| 549 | 323296 | Base::scale_x(0), Base::scale_x(0), Base::scale_x(0), Base::scale_x(1), | |
| 550 | 80824 | Base::scale_x(1), Base::scale_x(1), Base::scale_x(2), Base::scale_x(2), | |
| 551 | 80824 | Base::scale_x(2), Base::scale_x(3), Base::scale_x(3), Base::scale_x(3), | |
| 552 | 80824 | Base::scale_x(4), Base::scale_x(4), Base::scale_x(4), Base::scale_x(5)}; | |
| 553 | } | ||
| 554 | 80824 | uint8x16_t gen_vsx_idx_diff_r() { | |
| 555 | 80824 | return uint8x16_t{0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}; | |
| 556 | } | ||
| 557 | |||
| 558 | 80824 | uint32x4x4_t gen_vsx_g() { | |
| 559 | 161648 | return uint32x4x4_t{ | |
| 560 | 323296 | Base::scale_x(0), Base::scale_x(0), Base::scale_x(1), Base::scale_x(1), | |
| 561 | 80824 | Base::scale_x(1), Base::scale_x(2), Base::scale_x(2), Base::scale_x(2), | |
| 562 | 80824 | Base::scale_x(3), Base::scale_x(3), Base::scale_x(3), Base::scale_x(4), | |
| 563 | 80824 | Base::scale_x(4), Base::scale_x(4), Base::scale_x(5), Base::scale_x(5)}; | |
| 564 | } | ||
| 565 | 80824 | uint8x16_t gen_vsx_idx_diff_g() { | |
| 566 | 80824 | return uint8x16_t{0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1}; | |
| 567 | } | ||
| 568 | |||
| 569 | 80824 | uint32x4x4_t gen_vsx_b() { | |
| 570 | 161648 | return uint32x4x4_t{ | |
| 571 | 323296 | Base::scale_x(0), Base::scale_x(1), Base::scale_x(1), Base::scale_x(1), | |
| 572 | 80824 | Base::scale_x(2), Base::scale_x(2), Base::scale_x(2), Base::scale_x(3), | |
| 573 | 80824 | Base::scale_x(3), Base::scale_x(3), Base::scale_x(4), Base::scale_x(4), | |
| 574 | 80824 | Base::scale_x(4), Base::scale_x(5), Base::scale_x(5), Base::scale_x(5)}; | |
| 575 | } | ||
| 576 | 80824 | uint8x16_t gen_vsx_idx_diff_b() { | |
| 577 | 80824 | return uint8x16_t{0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; | |
| 578 | } | ||
| 579 | |||
| 580 | 242472 | void fill_full_constants_vectorially( | |
| 581 | FullVectorInterpolationConstants &constants, uint32x4x4_t vsx, | ||
| 582 | uint8x16_t vsx_idx_diff, uint64_t sx_fixp, unsigned in_pixel_index) { | ||
| 583 | 484944 | uint64_t src_element_index_base = | |
| 584 | 242472 | ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index; | |
| 585 | 242472 | constants.src_element_index = | |
| 586 | 242472 | static_cast<ptrdiff_t>(src_element_index_base); | |
| 587 | |||
| 588 | // Create x coordinate for all lanes | ||
| 589 | 242472 | uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp & ((1 << kFixpBits) - 1)); | |
| 590 | 242472 | uint32x4_t vfrac = vdupq_n_u32(xfrac0); | |
| 591 | 242472 | uint8x16x2_t vsx_delta_lo, vsx_delta_hi; | |
| 592 | 242472 | vsx_delta_lo.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[0], vfrac)); | |
| 593 | 242472 | vsx_delta_lo.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[1], vfrac)); | |
| 594 | 242472 | vsx_delta_hi.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[2], vfrac)); | |
| 595 | 242472 | vsx_delta_hi.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[3], vfrac)); | |
| 596 | |||
| 597 | // Get index from coordinate | ||
| 598 | 242472 | uint8x8_t idx0 = vqtbl2_u8(vsx_delta_lo, Base::vsidx_tbl_); | |
| 599 | 242472 | uint8x8_t idx1 = vqtbl2_u8(vsx_delta_hi, Base::vsidx_tbl_); | |
| 600 | 242472 | uint8x16_t vsx0_idx = vcombine_u8(idx0, idx1); | |
| 601 | // One step in x means 3 steps in elements | ||
| 602 | 242472 | vsx0_idx = vmulq_u8(vsx0_idx, vdupq_n_u8(3)); | |
| 603 | // Align the stepping if the first lane is green or blue | ||
| 604 | 242472 | vsx0_idx = vqsubq_u8(vsx0_idx, vdupq_n_u8(in_pixel_index)); | |
| 605 | // Add in-pixel index | ||
| 606 | 242472 | vsx0_idx = vaddq_u8(vsx0_idx, vsx_idx_diff); | |
| 607 | 242472 | vst1q(constants.idx, vsx0_idx); | |
| 608 | |||
| 609 | // Get fraction from coordinate | ||
| 610 | 242472 | uint16x8x2_t vsxfrac; | |
| 611 | 242472 | vsxfrac.val[0] = | |
| 612 | 242472 | vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_lo, Base::vsfrac_tbl_)); | |
| 613 | 242472 | vsxfrac.val[1] = | |
| 614 | 242472 | vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_hi, Base::vsfrac_tbl_)); | |
| 615 | 242472 | VecTraits<uint16_t>::store(vsxfrac, constants.xfrac); | |
| 616 | 242472 | } | |
| 617 | |||
| 618 | 136 | void fill_full_constants_scalarly(FullVectorInterpolationConstants &constants, | |
| 619 | unsigned in_pixel_index, | ||
| 620 | uint64_t src_element_index, | ||
| 621 | uint64_t src_element_base, | ||
| 622 | uint64_t sx_fixp) { | ||
| 623 | 136 | constants.src_element_index = static_cast<ptrdiff_t>(src_element_base); | |
| 624 | |||
| 625 | 272 | fill_idx_xfrac(constants, in_pixel_index, src_element_index, | |
| 626 | 136 | src_element_base, sx_fixp); | |
| 627 | 136 | } | |
| 628 | |||
| 629 | 225 | void fill_half_constants_scalarly(HalfVectorInterpolationConstants &constants, | |
| 630 | uint64_t dst_element_index, | ||
| 631 | unsigned in_pixel_index, | ||
| 632 | uint64_t src_element_index, | ||
| 633 | uint64_t src_element_base, | ||
| 634 | uint64_t sx_fixp) { | ||
| 635 | 225 | constants.dst_element_index = static_cast<ptrdiff_t>(dst_element_index); | |
| 636 | 225 | constants.src_element_index = static_cast<ptrdiff_t>(src_element_base); | |
| 637 | |||
| 638 | 450 | fill_idx_xfrac(constants, in_pixel_index, src_element_index, | |
| 639 | 225 | src_element_base, sx_fixp); | |
| 640 | 225 | } | |
| 641 | |||
| 642 | template <typename VectorConstants> | ||
| 643 | 361 | void fill_idx_xfrac(VectorConstants &constants, unsigned in_pixel_index, | |
| 644 | uint64_t src_element_index, uint64_t src_element_base, | ||
| 645 | uint64_t sx_fixp) { | ||
| 646 | // For indexing inside idx and xfrac arrays of | ||
| 647 | // the interpolation constants | ||
| 648 | 361 | unsigned j = 0; | |
| 649 | 361 | uint8_t idx = (src_element_index - src_element_base); | |
| 650 | 361 | uint16_t xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2); | |
| 651 | |||
| 652 |
8/8✓ Branch 0 taken 185 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 163 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 165 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 279 times.
✓ Branch 7 taken 138 times.
|
1153 | for (; j < (kChannels - in_pixel_index); ++j) { |
| 653 | 792 | constants.idx[j] = idx + j; | |
| 654 | 792 | constants.xfrac[j] = xfrac; | |
| 655 | 792 | } | |
| 656 | |||
| 657 | 361 | sx_fixp += sx_fixp_one_dst_pixel_; | |
| 658 | 361 | src_element_index = (sx_fixp >> kFixpBits) * kChannels; | |
| 659 | 361 | idx = (src_element_index - src_element_base); | |
| 660 | 361 | xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2); | |
| 661 | |||
| 662 | 361 | constexpr size_t idx_frac_elem_num = sizeof(VectorConstants::idx); | |
| 663 | |||
| 664 |
8/8✓ Branch 0 taken 370 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 193 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 310 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 300 times.
✓ Branch 7 taken 138 times.
|
1534 | while (j < idx_frac_elem_num) { |
| 665 | // k is the index for the elements in one pixel | ||
| 666 |
16/16✓ Branch 0 taken 74 times.
✓ Branch 1 taken 1295 times.
✓ Branch 2 taken 999 times.
✓ Branch 3 taken 370 times.
✓ Branch 4 taken 87 times.
✓ Branch 5 taken 639 times.
✓ Branch 6 taken 533 times.
✓ Branch 7 taken 193 times.
✓ Branch 8 taken 62 times.
✓ Branch 9 taken 1075 times.
✓ Branch 10 taken 827 times.
✓ Branch 11 taken 310 times.
✓ Branch 12 taken 138 times.
✓ Branch 13 taken 987 times.
✓ Branch 14 taken 825 times.
✓ Branch 15 taken 300 times.
|
4357 | for (unsigned k = 0; (j < idx_frac_elem_num) && (k < kChannels); |
| 667 | 3184 | ++j, ++k) { | |
| 668 | 3184 | constants.idx[j] = idx + k; | |
| 669 | 3184 | constants.xfrac[j] = xfrac; | |
| 670 | 3184 | } | |
| 671 | 1173 | sx_fixp += sx_fixp_one_dst_pixel_; | |
| 672 | 1173 | src_element_index = (sx_fixp >> kFixpBits) * kChannels; | |
| 673 | 1173 | idx = (src_element_index - src_element_base); | |
| 674 | 1173 | xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2); | |
| 675 | } | ||
| 676 | 361 | } | |
| 677 | |||
| 678 | static constexpr size_t kChannels = 3; | ||
| 679 | // Difference in source x coordinate for one destination pixel | ||
| 680 | const size_t sx_fixp_one_dst_pixel_; | ||
| 681 | }; | ||
| 682 | |||
| 683 | template <ptrdiff_t kRatio, ptrdiff_t kChannels, | ||
| 684 | bool kSetRightmostLanes = false> | ||
| 685 | class ResizeGenericU8Operation final { | ||
| 686 | public: | ||
| 687 | 301 | ResizeGenericU8Operation(const uint8_t *src, size_t src_stride, | |
| 688 | size_t src_height, size_t y_begin, size_t y_end, | ||
| 689 | uint8_t *dst, size_t dst_stride, size_t dst_height) | ||
| 690 | 301 | : src_rows_{src, src_stride, kChannels}, | |
| 691 | 301 | dst_rows_{dst, dst_stride, kChannels}, | |
| 692 | 301 | src_height_{src_height}, | |
| 693 | 301 | y_begin_{y_begin}, | |
| 694 | 301 | y_end_{y_end}, | |
| 695 | 301 | dst_height_{dst_height} {} | |
| 696 | |||
| 697 | 301 | void process_rows(RowInterpolationConstants &row_interpolation_constants) { | |
| 698 |
14/16✓ Branch 0 taken 48 times.
✓ Branch 1 taken 448 times.
✓ Branch 2 taken 54 times.
✓ Branch 3 taken 688 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 43 times.
✓ Branch 7 taken 643 times.
✓ Branch 8 taken 48 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 54 times.
✓ Branch 11 taken 572 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 13 times.
✓ Branch 14 taken 46 times.
✓ Branch 15 taken 536 times.
|
3533 | for (uint64_t dst_y = y_begin_; dst_y < y_end_; ++dst_y) { |
| 699 | 3232 | process_row(dst_y, row_interpolation_constants); | |
| 700 | 3232 | } | |
| 701 | 301 | } | |
| 702 | |||
| 703 | private: | ||
| 704 | 3232 | uint64_t to_src_y(uint64_t dy) const { | |
| 705 | 3232 | return aligned_scale(dy, src_height_, dst_height_); | |
| 706 | } | ||
| 707 | |||
| 708 | 3232 | void process_row(uint64_t dy, | |
| 709 | RowInterpolationConstants &row_interpolation_constants) { | ||
| 710 | 3232 | VectorPathNums num_of_vector_paths = | |
| 711 | 3232 | row_interpolation_constants.num_of_vector_paths(); | |
| 712 | 6464 | auto *full_array = | |
| 713 | 3232 | row_interpolation_constants.full_vector_constants_array(); | |
| 714 | 6464 | auto *half_array = | |
| 715 | 3232 | row_interpolation_constants.half_vector_constants_array(); | |
| 716 | |||
| 717 | 3232 | uint64_t sy_fixp = to_src_y(dy); | |
| 718 | 3232 | ptrdiff_t sy = static_cast<ptrdiff_t>(sy_fixp >> kFixpBits); | |
| 719 | 3232 | const uint8_t *src_top = &src_rows_.at(sy)[0]; | |
| 720 | 3232 | const uint8_t *src_bottom = &src_rows_.at(sy + 1)[0]; | |
| 721 | 3232 | uint8_t *dst = &dst_rows_.at(static_cast<ptrdiff_t>(dy))[0]; | |
| 722 | // Get the highest 8 bits of the fractional part | ||
| 723 | // This is a good compromise between accuracy and performance | ||
| 724 | // Because the result is 8bits, the error only affects the least | ||
| 725 | // significant 1-2 bits, see the accuracy calculation in kleidicv.h | ||
| 726 | 6464 | uint16_t yfrac = | |
| 727 | 3232 | static_cast<uint16_t>((sy_fixp - (sy << kFixpBits)) >> (kFixpBits - 8)); | |
| 728 | |||
| 729 | 3232 | ptrdiff_t dst_element_index = 0; | |
| 730 | |||
| 731 |
14/16✓ Branch 0 taken 21912 times.
✓ Branch 1 taken 448 times.
✓ Branch 2 taken 45764 times.
✓ Branch 3 taken 688 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 68368 times.
✓ Branch 7 taken 643 times.
✓ Branch 8 taken 20864 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 44414 times.
✓ Branch 11 taken 572 times.
✓ Branch 12 taken 1068 times.
✓ Branch 13 taken 13 times.
✓ Branch 14 taken 67072 times.
✓ Branch 15 taken 536 times.
|
272694 | for (size_t i = 0; i < num_of_vector_paths.two_x; i += 1) { |
| 732 | 269462 | uint8x16x2_t res{}; | |
| 733 | 269462 | res.val[0] = vector_path(full_array[i * 2], src_top, src_bottom, yfrac); | |
| 734 | 269462 | res.val[1] = | |
| 735 | 269462 | vector_path(full_array[(i * 2) + 1], src_top, src_bottom, yfrac); | |
| 736 | 269462 | VecTraits<uint8_t>::store(res, &dst[dst_element_index]); | |
| 737 | 269462 | dst_element_index += kStep * 2; | |
| 738 | 269462 | } | |
| 739 | |||
| 740 |
14/16✓ Branch 0 taken 448 times.
✓ Branch 1 taken 1086 times.
✓ Branch 2 taken 688 times.
✓ Branch 3 taken 1642 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 643 times.
✓ Branch 7 taken 1505 times.
✓ Branch 8 taken 332 times.
✓ Branch 9 taken 880 times.
✓ Branch 10 taken 572 times.
✓ Branch 11 taken 1494 times.
✓ Branch 12 taken 13 times.
✓ Branch 13 taken 39 times.
✓ Branch 14 taken 536 times.
✓ Branch 15 taken 1620 times.
|
11498 | for (size_t i = 0; i < num_of_vector_paths.half; i += 1) { |
| 741 | 8266 | auto res = vector_path_half(half_array[i], yfrac, src_top, src_bottom); | |
| 742 | 8266 | vst1(&dst[half_array[i].dst_element_index], res); | |
| 743 | 8266 | } | |
| 744 | 3232 | } | |
| 745 | |||
| 746 | 8266 | uint8x8_t vector_path_half(const HalfVectorInterpolationConstants &constants, | |
| 747 | uint16_t yfrac, const uint8_t *src_top, | ||
| 748 | const uint8_t *src_bottom) const { | ||
| 749 | 8266 | uint8x8_t vsx0_idx = vld1_u8(constants.idx); | |
| 750 | 8266 | uint8x8_t vsx1_idx = vadd_u8(vsx0_idx, vdup_n_u8(kChannels)); | |
| 751 | 8266 | uint16x8_t vsxfrac; | |
| 752 | 8266 | VecTraits<uint16_t>::load(constants.xfrac, vsxfrac); | |
| 753 | 8266 | ptrdiff_t src_element_index = constants.src_element_index; | |
| 754 | |||
| 755 | using SrcVecType = std::conditional_t<kRatio == 2 && kChannels != 3, | ||
| 756 | uint8x16_t, uint8x16x2_t>; | ||
| 757 | 8266 | SrcVecType topsrc, bottomsrc; | |
| 758 | 8266 | VecTraits<uint8_t>::load(&src_top[src_element_index], topsrc); | |
| 759 | 8266 | VecTraits<uint8_t>::load(&src_bottom[src_element_index], bottomsrc); | |
| 760 | |||
| 761 | 8266 | uint8x8_t a, b, c, d; | |
| 762 | if constexpr (kRatio == 2 && kChannels != 3) { | ||
| 763 | 2728 | a = vqtbl1_u8(topsrc, vsx0_idx); | |
| 764 | 2728 | b = vqtbl1_u8(topsrc, vsx1_idx); | |
| 765 | 2728 | c = vqtbl1_u8(bottomsrc, vsx0_idx); | |
| 766 | 2728 | d = vqtbl1_u8(bottomsrc, vsx1_idx); | |
| 767 | } else if constexpr (kRatio == 3 || kChannels == 3) { | ||
| 768 | 5538 | a = vqtbl2_u8(topsrc, vsx0_idx); | |
| 769 | 5538 | b = vqtbl2_u8(topsrc, vsx1_idx); | |
| 770 | 5538 | c = vqtbl2_u8(bottomsrc, vsx0_idx); | |
| 771 | 5538 | d = vqtbl2_u8(bottomsrc, vsx1_idx); | |
| 772 | } | ||
| 773 | 16532 | uint8x8_t left = | |
| 774 | 8266 | vraddhn_u16(vshll_n_u8(a, 8), vmulq_n_u16(vsubl_u8(c, a), yfrac)); | |
| 775 | 16532 | uint8x8_t right = | |
| 776 | 8266 | vraddhn_u16(vshll_n_u8(b, 8), vmulq_n_u16(vsubl_u8(d, b), yfrac)); | |
| 777 | 16532 | uint8x8_t res = vraddhn_u16(vshll_n_u8(left, 8), | |
| 778 | 8266 | vmulq_u16(vsubl_u8(right, left), vsxfrac)); | |
| 779 | 16532 | return res; | |
| 780 | 8266 | } | |
| 781 | |||
| 782 | 538924 | uint8x16_t vector_path(const FullVectorInterpolationConstants &constants, | |
| 783 | const uint8_t *src_top, const uint8_t *src_bottom, | ||
| 784 | uint16_t yfrac) const { | ||
| 785 | 538924 | uint8x16_t vsx0_idx = vld1q(constants.idx); | |
| 786 | 538924 | uint8x16_t vsx1_idx = vaddq_u8(vsx0_idx, vdupq_n_u8(kChannels)); | |
| 787 | 538924 | uint16x8x2_t vsxfrac2; | |
| 788 | 538924 | VecTraits<uint16_t>::load(constants.xfrac, vsxfrac2); | |
| 789 | 538924 | ptrdiff_t src_element_index = constants.src_element_index; | |
| 790 | |||
| 791 | using SrcVecType = | ||
| 792 | std::conditional_t<kRatio == 2, uint8x16x2_t, uint8x16x3_t>; | ||
| 793 | 538924 | SrcVecType topsrc, bottomsrc; | |
| 794 | 538924 | VecTraits<uint8_t>::load(&src_top[src_element_index], topsrc); | |
| 795 | 538924 | VecTraits<uint8_t>::load(&src_bottom[src_element_index], bottomsrc); | |
| 796 | 538924 | uint8x16_t a, b, c, d; | |
| 797 | if constexpr (kRatio == 2) { | ||
| 798 | 272088 | a = vqtbl2q_u8(topsrc, vsx0_idx); | |
| 799 | 272088 | b = vqtbl2q_u8(topsrc, vsx1_idx); | |
| 800 | 272088 | c = vqtbl2q_u8(bottomsrc, vsx0_idx); | |
| 801 | 272088 | d = vqtbl2q_u8(bottomsrc, vsx1_idx); | |
| 802 | if constexpr (kSetRightmostLanes) { | ||
| 803 | // table lookup would overindex topsrc and bottomsrc | ||
| 804 | ✗ | ptrdiff_t last_but_one_right_elem_idx = | |
| 805 | ✗ | src_element_index + constants.idx[14] + kChannels; | |
| 806 | ✗ | ptrdiff_t last_right_elem_idx = | |
| 807 | ✗ | src_element_index + constants.idx[15] + kChannels; | |
| 808 | ✗ | b = vsetq_lane_u8(src_top[last_but_one_right_elem_idx], b, 14); | |
| 809 | ✗ | b = vsetq_lane_u8(src_top[last_right_elem_idx], b, 15); | |
| 810 | ✗ | d = vsetq_lane_u8(src_bottom[last_but_one_right_elem_idx], d, 14); | |
| 811 | ✗ | d = vsetq_lane_u8(src_bottom[last_right_elem_idx], d, 15); | |
| 812 | } | ||
| 813 | } else if constexpr (kRatio == 3) { | ||
| 814 | 266836 | a = vqtbl3q_u8(topsrc, vsx0_idx); | |
| 815 | 266836 | b = vqtbl3q_u8(topsrc, vsx1_idx); | |
| 816 | 266836 | c = vqtbl3q_u8(bottomsrc, vsx0_idx); | |
| 817 | 266836 | d = vqtbl3q_u8(bottomsrc, vsx1_idx); | |
| 818 | // table lookup would overindex topsrc and bottomsrc | ||
| 819 | if constexpr (kSetRightmostLanes) { | ||
| 820 | 4272 | ptrdiff_t last_right_elem_idx = | |
| 821 | 2136 | src_element_index + constants.idx[15] + kChannels; | |
| 822 | 2136 | b = vsetq_lane_u8(src_top[last_right_elem_idx], b, 15); | |
| 823 | 2136 | d = vsetq_lane_u8(src_bottom[last_right_elem_idx], d, 15); | |
| 824 | 2136 | } | |
| 825 | } | ||
| 826 | 538924 | uint8x8_t left_lo = lerp_low_half(a, c, yfrac); | |
| 827 | 538924 | uint8x8_t left_hi = lerp_high_half(a, c, yfrac); | |
| 828 | 538924 | uint8x8_t right_lo = lerp_low_half(b, d, yfrac); | |
| 829 | 538924 | uint8x8_t right_hi = lerp_high_half(b, d, yfrac); | |
| 830 | 538924 | uint8x8_t res_lo = lerp_full(left_lo, right_lo, vsxfrac2.val[0]); | |
| 831 | 538924 | uint8x8_t res_hi = lerp_full(left_hi, right_hi, vsxfrac2.val[1]); | |
| 832 | 1077848 | return vcombine_u8(res_lo, res_hi); | |
| 833 | 538924 | } | |
| 834 | |||
| 835 | 1077848 | static uint8x8_t lerp_low_half(uint8x16_t a, uint8x16_t b, uint16_t w) { | |
| 836 | 1077848 | return vraddhn_u16( | |
| 837 | 1077848 | vshll_n_u8(vget_low_u8(a), 8), | |
| 838 | 1077848 | vmulq_n_u16(vsubl_u8(vget_low_u8(b), vget_low_u8(a)), w)); | |
| 839 | } | ||
| 840 | |||
| 841 | 1077848 | static uint8x8_t lerp_high_half(uint8x16_t a, uint8x16_t b, uint16_t w) { | |
| 842 | 2155696 | return vraddhn_u16(vshll_high_n_u8(a, 8), | |
| 843 | 1077848 | vmulq_n_u16(vsubl_high_u8(b, a), w)); | |
| 844 | } | ||
| 845 | |||
| 846 | 1077848 | static uint8x8_t lerp_full(uint8x8_t a, uint8x8_t b, uint16x8_t w) { | |
| 847 | 1077848 | return vraddhn_u16(vshll_n_u8(a, 8), vmulq_u16(vsubl_u8(b, a), w)); | |
| 848 | } | ||
| 849 | |||
| 850 | const Rows<const uint8_t> src_rows_; | ||
| 851 | const Rows<uint8_t> dst_rows_; | ||
| 852 | const size_t src_height_; | ||
| 853 | const size_t y_begin_; | ||
| 854 | const size_t y_end_; | ||
| 855 | const size_t dst_height_; | ||
| 856 | }; | ||
| 857 | |||
| 858 | } // namespace kleidicv::neon::resize_linear_generic_u8 | ||
| 859 |