KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_generic_u8_neon.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 478 489 97.8%
Functions: 144 153 94.1%
Branches: 151 274 55.1%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <algorithm>
6 #include <cstddef>
7 #include <cstdint>
8 #include <cstdlib>
9 #include <memory>
10 #include <utility>
11 #include <variant>
12
13 #include "kleidicv/ctypes.h"
14 #include "kleidicv/neon.h"
15 #include "kleidicv/utils.h"
16
17 namespace kleidicv::neon::resize_linear_generic_u8 {
18
19 //------------------------------------------------------
20 /// Generic resize for ratios 1/3 to 1/1, u8
21 //------------------------------------------------------
22
23 // For the coordinate calculation, fixed-point format is used, for better
24 // performance. Fixed-point format:
25 // - lowest 16 bits are the fractional part, that is the kFixpBits constant
26 // - at interpolation, the high 8 bits are used from the fractional part
27 // (this is a good compromise between accuracy and performance: because the
28 // result is 8bits, the error only affects the least significant 1-2 bits, see
29 // the accuracy calculation in kleidicv.h
30 // - to get the integer part, right shift by 16 bits, or zip/unzip/tbl etc. to
31 // get the bytes needed
32 // - for better accuracy, rounding is needed everywhere, i.e. adding 0.5, which
33 // is 1 << 15
34
35 static constexpr ptrdiff_t kFixpBits = 16;
36 static constexpr ptrdiff_t kFixpHalf = (1UL << (kFixpBits - 1));
37 static constexpr ptrdiff_t kStep = kVectorLength / sizeof(uint8_t);
38 static constexpr ptrdiff_t kHalfStep = kStep / 2;
39
40 struct FullVectorInterpolationConstants {
41 uint8_t idx[kStep];
42 uint16_t xfrac[kStep];
43 ptrdiff_t src_element_index;
44 };
45
46 struct HalfVectorInterpolationConstants {
47 uint8_t idx[kHalfStep];
48 uint16_t xfrac[kHalfStep];
49 ptrdiff_t src_element_index;
50 ptrdiff_t dst_element_index;
51 };
52
53 struct VectorPathNums {
54 size_t two_x;
55 size_t half;
56
57 313 explicit VectorPathNums(std::pair<size_t, size_t> sizes)
58 313 : two_x{sizes.first}, half{sizes.second} {}
59 };
60
61 template <typename T = uint64_t>
62 4050324 static T rounding_div(uint64_t nom, uint64_t denom) {
63 4050324 return static_cast<T>((nom + denom / 2) / denom);
64 }
65
66 // Scale coordinate using this formula, so the center is aligned:
67 // source_x = (destination_x + 0.5) / scale - 0.5;
68 // plus 1/256/2 for later rounding the fractional part to 8bits
69 5019 static inline uint64_t aligned_scale(uint64_t x, uint64_t nom, uint64_t denom) {
70 5019 return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) - kFixpHalf +
71 (1 << (kFixpBits - 9));
72 }
73
74 class RowInterpolationConstants {
75 public:
76 // Constructible only through create
77 RowInterpolationConstants() = delete;
78
79 313 static std::variant<RowInterpolationConstants, kleidicv_error_t> create(
80 VectorPathNums num_of_vector_paths) {
81 {
82 626 uint8_t *allocation = static_cast<uint8_t *>(malloc(
83 313 num_of_vector_paths.two_x * 2 *
84 313 sizeof(FullVectorInterpolationConstants) +
85 313 num_of_vector_paths.half * sizeof(HalfVectorInterpolationConstants)));
86
2/2
✓ Branch 0 taken 301 times.
✓ Branch 1 taken 12 times.
313 if (!allocation) {
87 12 return KLEIDICV_ERROR_ALLOCATION;
88 }
89
90 301 return RowInterpolationConstants{num_of_vector_paths, allocation};
91 313 }
92 return KLEIDICV_OK;
93 313 }
94
95 125003 VectorPathNums num_of_vector_paths() const { return num_of_vector_paths_; }
96
97 486760 FullVectorInterpolationConstants *full_vector_constants_array() const {
98 486760 return full_vector_constants_array_;
99 }
100
101 3941 HalfVectorInterpolationConstants *half_vector_constants_array() const {
102 3941 return half_vector_constants_array_;
103 }
104
105 private:
106 301 RowInterpolationConstants(VectorPathNums num_of_vector_paths, uint8_t *buffer)
107 301 : buffer_{buffer, &std::free},
108 602 full_vector_constants_array_{
109 301 reinterpret_cast<FullVectorInterpolationConstants *>(buffer)},
110 602 half_vector_constants_array_{
111 reinterpret_cast<HalfVectorInterpolationConstants *>(
112 602 full_vector_constants_array_ +
113 301 (num_of_vector_paths.two_x * 2))},
114 301 num_of_vector_paths_{num_of_vector_paths} {}
115
116 using FreeDeleter = decltype(&std::free);
117 std::unique_ptr<uint8_t, FreeDeleter> buffer_;
118 FullVectorInterpolationConstants *const full_vector_constants_array_;
119 HalfVectorInterpolationConstants *const half_vector_constants_array_;
120 const VectorPathNums num_of_vector_paths_;
121 };
122
123 template <ptrdiff_t kRatio, ptrdiff_t kChannels>
124 class RowInterpolationConstantsGeneratorBase {
125 protected:
126 313 RowInterpolationConstantsGeneratorBase(size_t src_width, size_t dst_width)
127 313 : src_width_{src_width},
128 313 dst_width_{dst_width},
129 313 vsidx_tbl_{2, 6, 10, 14, 18, 22, 26, 30},
130 313 vsfrac_tbl_{1, 255, 5, 255, 9, 255, 13, 255,
131 313 17, 255, 21, 255, 25, 255, 29, 255} {}
132
133 313 std::pair<size_t, size_t> calculate_num_of_vector_paths() {
134
8/12
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 56 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 45 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 30 times.
✓ Branch 7 taken 20 times.
✓ Branch 8 taken 56 times.
✗ Branch 9 not taken.
✓ Branch 10 taken 56 times.
✗ Branch 11 not taken.
313 size_t two_x = ((src_width_ * kChannels) >= (sizeof(uint8x16_t) * kRatio))
135 273 ? ((dst_width_ * kChannels) / (2 * kStep))
136 : 0;
137
138 626 size_t remaining_dx_after_2x_cycle =
139 313 (dst_width_ * kChannels) - (two_x * 2 * kStep);
140 313 size_t half = align_up(remaining_dx_after_2x_cycle, kHalfStep) / kHalfStep;
141 313 return {two_x, half};
142 313 }
143
144 // Scale destination x coordinate to source x coordinate, into fixed-point,
145 // without center correction
146 3882944 uint32_t scale_x(uint64_t dx) const {
147 3882944 return rounding_div<uint32_t>(((dx * src_width_) << kFixpBits), dst_width_);
148 }
149
150 1787 uint64_t to_src_x(uint64_t dx) const {
151 1787 return aligned_scale(dx, src_width_, dst_width_);
152 }
153
154 const size_t src_width_;
155 const size_t dst_width_;
156 const uint8x8_t vsidx_tbl_;
157 const uint8x16_t vsfrac_tbl_;
158 };
159
160 template <ptrdiff_t kRatio, ptrdiff_t kChannels>
161 class RowInterpolationConstantsGenerator final
162 : RowInterpolationConstantsGeneratorBase<kRatio, kChannels> {
163 public:
164 using Base = RowInterpolationConstantsGeneratorBase<kRatio, kChannels>;
165 212 RowInterpolationConstantsGenerator(size_t src_width, size_t dst_width)
166 212 : Base{src_width, dst_width},
167 // These starting values are not aligned to center. The center alignment
168 // must be added only once. When added to a center-aligned source_x
169 // value, the result will be center-aligned.
170 636 vsx0_0_{Base::scale_x(0), Base::scale_x(1 / kChannels),
171 424 Base::scale_x(2 / kChannels), Base::scale_x(3 / kChannels)},
172 636 vsx0_1_{Base::scale_x(4 / kChannels), Base::scale_x(5 / kChannels),
173 424 Base::scale_x(6 / kChannels), Base::scale_x(7 / kChannels)},
174 636 vsx0_2_{Base::scale_x(8 / kChannels), Base::scale_x(9 / kChannels),
175 424 Base::scale_x(10 / kChannels), Base::scale_x(11 / kChannels)},
176 636 vsx0_3_{Base::scale_x(12 / kChannels), Base::scale_x(13 / kChannels),
177 636 Base::scale_x(14 / kChannels), Base::scale_x(15 / kChannels)} {}
178
179 212 std::variant<RowInterpolationConstants, kleidicv_error_t> operator()() {
180 212 VectorPathNums v{Base::calculate_num_of_vector_paths()};
181 212 auto row_interpolation_constants_variant =
182 212 RowInterpolationConstants::create(v);
183
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 54 times.
212 if (std::holds_alternative<kleidicv_error_t>(
184 row_interpolation_constants_variant)) {
185 // Creation failed with some error, return with the variant as it is
186 8 return row_interpolation_constants_variant;
187 }
188 204 auto &row_interpolation_constants = *std::get_if<RowInterpolationConstants>(
189 &row_interpolation_constants_variant);
190
191 204 uint64_t dx = 0;
192 204 uint64_t sx_fixp = 0;
193
194 // Calculate constants for full vectors
195
196 // Maximum source coordinate for vector path 2x
197 408 const uint64_t max_sx_2x =
198
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
612 std::max(Base::src_width_ * kChannels - (sizeof(uint8x16_t) * kRatio),
199 408 0UL) /
200 kChannels;
201 // Difference in source x coordinate for one vector path
202 408 const uint64_t sx_fixp_vector_step = rounding_div(
203 204 (Base::src_width_ * kStep / kChannels) << kFixpBits, Base::dst_width_);
204
205
8/8
✓ Branch 0 taken 19838 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 39766 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 20242 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 40614 times.
✓ Branch 7 taken 54 times.
120664 for (size_t i = 0;
206
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
120664 i < row_interpolation_constants.num_of_vector_paths().two_x; ++i) {
207 // Repeatedly adding sx_fixp_vector_step is faster than scaling dx to sx,
208 // but it accumulates fixed-point error; periodic recalibration resets it.
209 // The maximum per-addition error of sx_fixp_vector_step is 0.5 / (1 <<
210 // 16). Only the upper 8 bits of the 16-bit fractional part are used for
211 // interpolation, so once the accumulated error reaches 1 / (1 << 8), it
212 // can affect later stages. This corresponds to 512 additions. Since two
213 // additions are performed per cycle, we recalibrate every 256 cycles,
214 // calculated by this mask.
215 120460 constexpr uint64_t kRecalibrateCycleMask = ((1 << 8) - 1);
216
8/8
✓ Branch 0 taken 19742 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 39570 times.
✓ Branch 3 taken 196 times.
✓ Branch 4 taken 20144 times.
✓ Branch 5 taken 98 times.
✓ Branch 6 taken 40414 times.
✓ Branch 7 taken 200 times.
120460 if ((i & kRecalibrateCycleMask) == 0) {
217
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
590 sx_fixp = Base::to_src_x(dx);
218 590 }
219
220 // Pull back sx if it would overrun
221 120460 uint64_t sx_candidate = sx_fixp >> kFixpBits;
222 120460 uint64_t sx_base = std::min(max_sx_2x, sx_candidate);
223
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
120460 calculate_indices_fractions_base_2x(
224
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
120460 row_interpolation_constants.full_vector_constants_array()[i * 2],
225 120460 sx_base, sx_fixp);
226 120460 sx_fixp += sx_fixp_vector_step;
227 120460 dx += kStep / kChannels;
228
229 // Pull back sx if it would overrun
230 120460 sx_candidate = sx_fixp >> kFixpBits;
231 120460 sx_base = std::min(max_sx_2x, sx_candidate);
232
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
120460 calculate_indices_fractions_base_2x(
233 120460 row_interpolation_constants
234
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
120460 .full_vector_constants_array()[(i * 2) + 1],
235 120460 sx_base, sx_fixp);
236 120460 sx_fixp += sx_fixp_vector_step;
237 120460 dx += kStep / kChannels;
238 120460 }
239
240 // Calculate constants for half vectors
241
242
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
204 sx_fixp = Base::to_src_x(dx);
243
244 // Difference in source x coordinate for one destination pixel
245 408 const uint64_t sx_fixp_one_dst_pixel =
246 204 rounding_div(Base::src_width_ << kFixpBits, Base::dst_width_);
247 // Maximum source coordinate for half vector path
248 408 const uint64_t max_sx_half =
249
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
612 std::max(Base::src_width_ * kChannels -
250
251 (sizeof(uint8x16_t) * (kRatio - 1)),
252 408 0UL) /
253 kChannels;
254 // Maximum destination coordinate for half vector path
255 204 const uint64_t max_dx_half = Base::dst_width_ - (kHalfStep / kChannels);
256 // Difference in source x coordinate for the half vector path
257 408 const uint64_t sx_fixp_half_step =
258 408 rounding_div((Base::src_width_ * kHalfStep / kChannels) << kFixpBits,
259 204 Base::dst_width_);
260
261
8/8
✓ Branch 0 taken 94 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 118 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 112 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 160 times.
✓ Branch 7 taken 54 times.
688 for (size_t i = 0;
262
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
688 i < row_interpolation_constants.num_of_vector_paths().half; ++i) {
263 // If (dx + half vector length) would overrun the buffer, pull it back
264 484 uint64_t dx_pulled_back = std::min(dx, max_dx_half);
265 // Pull back sx if dx was pulled back
266 484 sx_fixp -= (dx - dx_pulled_back) * sx_fixp_one_dst_pixel;
267 484 dx = dx_pulled_back;
268 // If (sx_base + reading length) would overrun the buffer, pull sx back
269 // again
270 484 uint64_t sx_candidate = sx_fixp >> kFixpBits;
271 484 uint64_t sx_base = std::min(max_sx_half, sx_candidate);
272
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
484 calculate_indices_fractions_base_half(
273
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
484 row_interpolation_constants.half_vector_constants_array()[i], sx_base,
274 484 sx_fixp, dx);
275
276 484 dx += kHalfStep / kChannels;
277 484 sx_fixp += sx_fixp_half_step;
278 484 }
279
280 204 return row_interpolation_constants_variant;
281 212 }
282
283 private:
284 240920 void calculate_indices_fractions_base_2x(
285 FullVectorInterpolationConstants &constants, uint64_t sx_base,
286 uint64_t sx_fixp) {
287 240920 uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp - (sx_base << kFixpBits));
288 240920 uint32x4_t vfrac = vdupq_n_u32(xfrac0);
289 // Calculate x coordinate delta from sx_base, the integer part of source x
290 240920 uint8x16x2_t vsx_delta_lo, vsx_delta_hi;
291 240920 vsx_delta_lo.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_0_, vfrac));
292 240920 vsx_delta_lo.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_1_, vfrac));
293 240920 vsx_delta_hi.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_2_, vfrac));
294 240920 vsx_delta_hi.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_3_, vfrac));
295 240920 uint8x8_t idx0 = vqtbl2_u8(vsx_delta_lo, Base::vsidx_tbl_);
296 240920 uint8x8_t idx1 = vqtbl2_u8(vsx_delta_hi, Base::vsidx_tbl_);
297 240920 uint8x16_t vsx0_idx = vcombine_u8(idx0, idx1);
298 if constexpr (kChannels > 1) {
299 160760 vsx0_idx = vshlq_n_u8(vsx0_idx, kChannels == 4 ? 2 : 1);
300 160760 vsx0_idx =
301 160760 vaddq_u8(vsx0_idx, vreinterpretq_u8_u32(vdupq_n_u32(
302 kChannels == 4 ? 0x03020100U : 0x01000100)));
303 }
304 240920 vst1q(constants.idx, vsx0_idx);
305 240920 uint16x8x2_t vsxfrac;
306 240920 vsxfrac.val[0] =
307 240920 vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_lo, Base::vsfrac_tbl_));
308 240920 vsxfrac.val[1] =
309 240920 vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_hi, Base::vsfrac_tbl_));
310 240920 VecTraits<uint16_t>::store(vsxfrac, constants.xfrac);
311 240920 constants.src_element_index = static_cast<ptrdiff_t>(sx_base * kChannels);
312 240920 }
313
314 484 void calculate_indices_fractions_base_half(
315 HalfVectorInterpolationConstants &constants, uint64_t sx_base,
316 uint64_t sx_fixp, uint64_t dx) {
317 484 uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp - (sx_base << kFixpBits));
318 484 uint32x4_t vfrac = vdupq_n_u32(xfrac0);
319 484 uint8x16x2_t vsx_delta;
320 484 vsx_delta.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx0_0_, vfrac));
321 484 vsx_delta.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx0_1_, vfrac));
322 484 uint8x8_t vsx0_idx = vqtbl2_u8(vsx_delta, Base::vsidx_tbl_);
323 if constexpr (kChannels > 1) {
324 278 vsx0_idx = vshl_n_u8(vsx0_idx, kChannels == 4 ? 2 : 1);
325 278 vsx0_idx = vadd_u8(
326 556 vsx0_idx, vreinterpret_u8_u32(
327 278 vdup_n_u32(kChannels == 4 ? 0x03020100U : 0x01000100)));
328 }
329 484 vst1(constants.idx, vsx0_idx);
330 968 uint16x8_t vsxfrac =
331 484 vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta, Base::vsfrac_tbl_));
332 484 VecTraits<uint16_t>::store(vsxfrac, constants.xfrac);
333 484 constants.src_element_index = static_cast<ptrdiff_t>(sx_base * kChannels);
334 484 constants.dst_element_index = static_cast<ptrdiff_t>(dx * kChannels);
335 484 }
336
337 const uint32x4_t vsx0_0_;
338 const uint32x4_t vsx0_1_;
339 const uint32x4_t vsx0_2_;
340 const uint32x4_t vsx0_3_;
341 };
342
343 template <ptrdiff_t kRatio>
344 class RowInterpolationConstantsGenerator<kRatio, 3> final
345 : RowInterpolationConstantsGeneratorBase<kRatio, 3> {
346 public:
347 using Base = RowInterpolationConstantsGeneratorBase<kRatio, 3>;
348 101 RowInterpolationConstantsGenerator(size_t src_width, size_t dst_width)
349 101 : Base{src_width, dst_width},
350 202 sx_fixp_one_dst_pixel_{
351 202 rounding_div(src_width << kFixpBits, dst_width)} {}
352
353 101 std::variant<RowInterpolationConstants, kleidicv_error_t> operator()() {
354 101 VectorPathNums v{Base::calculate_num_of_vector_paths()};
355 101 auto row_interpolation_constants_variant =
356 101 RowInterpolationConstants::create(v);
357
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.
101 if (std::holds_alternative<kleidicv_error_t>(
358 row_interpolation_constants_variant)) {
359 // Creation failed with some error, return with the variant as it is
360 4 return row_interpolation_constants_variant;
361 }
362 97 auto &row_interpolation_constants = *std::get_if<RowInterpolationConstants>(
363 &row_interpolation_constants_variant);
364
365 97 uint64_t dst_element_index = 0;
366 97 uint64_t sx_fixp{};
367
368 // Calculate constants for full vectors
369
370 194 size_t num_of_full_vector_constants =
371 97 row_interpolation_constants.num_of_vector_paths().two_x * 2;
372
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 43 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
97 if (num_of_full_vector_constants > 0) {
373 97 size_t handled_full_vector_paths = 0;
374
375
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 29 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 54 times.
97 if (num_of_full_vector_constants > 3) {
376 166 size_t num_of_vector_paths_wout_pullback =
377
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
83 get_num_of_vector_paths_wout_pullback(num_of_full_vector_constants);
378 // Handle 3 vectors at a time, that way in pixel index is known at
379 // compile time
380 166 size_t vector_path_triplets_wout_pullback =
381 83 num_of_vector_paths_wout_pullback / 3;
382
383
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
83 sx_fixp = Base::to_src_x(0);
384 83 unsigned recalibrate_cnt = 0;
385
4/4
✓ Branch 0 taken 39686 times.
✓ Branch 1 taken 29 times.
✓ Branch 2 taken 41138 times.
✓ Branch 3 taken 54 times.
80907 for (size_t i = 0; i < vector_path_triplets_wout_pullback; ++i) {
386
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 const uint32x4x4_t vsx_r = gen_vsx_r();
387
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 const uint8x16_t vsx_idx_diff_r = gen_vsx_idx_diff_r();
388
389
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 const uint32x4x4_t vsx_g = gen_vsx_g();
390
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 const uint8x16_t vsx_idx_diff_g = gen_vsx_idx_diff_g();
391
392
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 const uint32x4x4_t vsx_b = gen_vsx_b();
393
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 const uint8x16_t vsx_idx_diff_b = gen_vsx_idx_diff_b();
394
395 // Difference in source x coordinate for 5 destination pixels
396 161648 const uint64_t sx_fixp_five_dst_pixel = rounding_div(
397 80824 (Base::src_width_ * 5) << kFixpBits, Base::dst_width_);
398 // Difference in source x coordinate for 6 destination pixels
399 161648 const uint64_t sx_fixp_six_dst_pixel = rounding_div(
400 80824 (Base::src_width_ * 6) << kFixpBits, Base::dst_width_);
401
402 // Repeatedly adding sx_fixp_five_dst_pixel and sx_fixp_six_dst_pixel
403 // is faster than scaling dx to sx, but it accumulates fixed-point
404 // error; periodic recalibration resets it. The maximum per-addition
405 // error of these values is 0.5 / (1 << 16). Only the upper 8
406 // bits of the 16-bit fractional part are used for interpolation, so
407 // once the accumulated error reaches 1 / (1 << 8), it can affect
408 // later stages. This corresponds to 512 additions. Since three
409 // additions are performed per cycle, we recalibrate every 170 cycles.
410
4/4
✓ Branch 0 taken 228 times.
✓ Branch 1 taken 39458 times.
✓ Branch 2 taken 234 times.
✓ Branch 3 taken 40904 times.
80824 if (recalibrate_cnt == 170) {
411
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
462 sx_fixp = Base::to_src_x(dst_element_index / 3);
412 462 recalibrate_cnt = 0;
413 462 } else {
414 80362 recalibrate_cnt++;
415 }
416
417 80824 unsigned in_pixel_index = 0;
418
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 fill_full_constants_vectorially(
419 80824 row_interpolation_constants
420 80824 .full_vector_constants_array()[handled_full_vector_paths],
421 80824 vsx_r, vsx_idx_diff_r, sx_fixp, in_pixel_index);
422
423 80824 sx_fixp += sx_fixp_five_dst_pixel;
424 80824 in_pixel_index = 1;
425
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 fill_full_constants_vectorially(
426 80824 row_interpolation_constants
427 80824 .full_vector_constants_array()[handled_full_vector_paths + 1],
428 80824 vsx_g, vsx_idx_diff_g, sx_fixp, in_pixel_index);
429
430 80824 sx_fixp += sx_fixp_five_dst_pixel;
431 80824 in_pixel_index = 2;
432
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
80824 fill_full_constants_vectorially(
433 80824 row_interpolation_constants
434 80824 .full_vector_constants_array()[handled_full_vector_paths + 2],
435 80824 vsx_b, vsx_idx_diff_b, sx_fixp, in_pixel_index);
436
437 80824 sx_fixp += sx_fixp_six_dst_pixel;
438 80824 handled_full_vector_paths += 3;
439 80824 dst_element_index += kStep * 3;
440 80824 }
441 83 }
442
443
4/4
✓ Branch 0 taken 74 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 62 times.
✓ Branch 3 taken 54 times.
233 while (handled_full_vector_paths < num_of_full_vector_constants) {
444 272 auto &constants =
445 136 row_interpolation_constants
446 136 .full_vector_constants_array()[handled_full_vector_paths];
447 // Maximum source coordinate for full vector path
448 272 const uint64_t max_src_base_index = std::max(
449 136 (Base::src_width_ * kChannels) - (sizeof(uint8x16_t) * kRatio),
450 136 0UL);
451
452 136 uint64_t dx = dst_element_index / kChannels;
453 136 unsigned in_pixel_index = dst_element_index % kChannels;
454
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
136 sx_fixp = Base::to_src_x(dx);
455
456 272 uint64_t src_element_index =
457 136 ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index;
458
459 // Pull back src if it would overrun
460 272 uint64_t src_element_base =
461 136 std::min(max_src_base_index, src_element_index);
462
463
0/8
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
272 fill_full_constants_scalarly(constants, in_pixel_index,
464 136 src_element_index, src_element_base,
465 136 sx_fixp);
466 136 handled_full_vector_paths++;
467 136 dst_element_index += kStep;
468 136 }
469 97 }
470
471 // Calculate constants for half vectors
472
473 // Maximum source coordinate for half vector path
474 97 uint64_t half_vector_path_src_read_size =
475 kChannels == 3 ? sizeof(uint8x16x2_t)
476 : (sizeof(uint8x16_t) * (kRatio - 1));
477 194 const uint64_t max_src_base_index = std::max(
478 97 Base::src_width_ * kChannels - half_vector_path_src_read_size, 0UL);
479 // Maximum destination coordinate for half vector path
480 194 const uint64_t max_dst_index_half =
481 97 (Base::dst_width_ * kChannels) - kHalfStep;
482
483
4/4
✓ Branch 0 taken 87 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 138 times.
✓ Branch 3 taken 54 times.
322 for (size_t i = 0;
484 322 i < row_interpolation_constants.num_of_vector_paths().half; ++i) {
485 450 auto &constants =
486 225 row_interpolation_constants.half_vector_constants_array()[i];
487
488 // If (dst index + half vector length) would overrun the buffer, pull it
489 // back
490 225 dst_element_index = std::min(dst_element_index, max_dst_index_half);
491
492 225 uint64_t dx = dst_element_index / kChannels;
493 225 unsigned in_pixel_index = dst_element_index % kChannels;
494
0/4
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
225 sx_fixp = Base::to_src_x(dx);
495 450 uint64_t src_element_index =
496 225 ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index;
497
498 // Pull back src if it would overrun
499 450 uint64_t src_element_base =
500 225 std::min(max_src_base_index, src_element_index);
501
502
0/8
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
450 fill_half_constants_scalarly(constants, dst_element_index, in_pixel_index,
503 225 src_element_index, src_element_base,
504 225 sx_fixp);
505
506 225 dst_element_index += kHalfStep;
507 225 }
508
509 97 return row_interpolation_constants_variant;
510 101 }
511
512 private:
513 83 size_t get_num_of_vector_paths_wout_pullback(
514 size_t num_of_full_vector_constants) {
515 170 auto vector_needs_pullback = [this](size_t dst_idx) {
516 87 unsigned in_pixel_idx = dst_idx % kChannels;
517 87 uint64_t dx = dst_idx / kChannels;
518 87 uint64_t sx_fixp = Base::to_src_x(dx);
519 87 uint64_t src_idx = ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_idx;
520
521 174 return (src_idx + (kStep * kRatio)) > (Base::src_width_ * kChannels);
522 87 };
523
524
2/4
✓ Branch 0 taken 29 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 54 times.
✗ Branch 3 not taken.
83 if (num_of_full_vector_constants == 0) {
525 return 0;
526 }
527
528 166 size_t candidate_last_vector_wout_pullback =
529 83 num_of_full_vector_constants - 1;
530
531 83 do {
532
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 29 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 54 times.
87 if (!vector_needs_pullback(candidate_last_vector_wout_pullback * kStep)) {
533 83 break;
534 }
535 4 candidate_last_vector_wout_pullback--;
536
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
4 } while (candidate_last_vector_wout_pullback > 0);
537
538
2/4
✓ Branch 0 taken 29 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 54 times.
✗ Branch 3 not taken.
83 if (candidate_last_vector_wout_pullback == 0) {
539 if (vector_needs_pullback(candidate_last_vector_wout_pullback * kStep)) {
540 return 0;
541 }
542 }
543
544 83 return candidate_last_vector_wout_pullback + 1;
545 83 }
546
547 80824 uint32x4x4_t gen_vsx_r() {
548 161648 return uint32x4x4_t{
549 323296 Base::scale_x(0), Base::scale_x(0), Base::scale_x(0), Base::scale_x(1),
550 80824 Base::scale_x(1), Base::scale_x(1), Base::scale_x(2), Base::scale_x(2),
551 80824 Base::scale_x(2), Base::scale_x(3), Base::scale_x(3), Base::scale_x(3),
552 80824 Base::scale_x(4), Base::scale_x(4), Base::scale_x(4), Base::scale_x(5)};
553 }
554 80824 uint8x16_t gen_vsx_idx_diff_r() {
555 80824 return uint8x16_t{0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0};
556 }
557
558 80824 uint32x4x4_t gen_vsx_g() {
559 161648 return uint32x4x4_t{
560 323296 Base::scale_x(0), Base::scale_x(0), Base::scale_x(1), Base::scale_x(1),
561 80824 Base::scale_x(1), Base::scale_x(2), Base::scale_x(2), Base::scale_x(2),
562 80824 Base::scale_x(3), Base::scale_x(3), Base::scale_x(3), Base::scale_x(4),
563 80824 Base::scale_x(4), Base::scale_x(4), Base::scale_x(5), Base::scale_x(5)};
564 }
565 80824 uint8x16_t gen_vsx_idx_diff_g() {
566 80824 return uint8x16_t{0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1};
567 }
568
569 80824 uint32x4x4_t gen_vsx_b() {
570 161648 return uint32x4x4_t{
571 323296 Base::scale_x(0), Base::scale_x(1), Base::scale_x(1), Base::scale_x(1),
572 80824 Base::scale_x(2), Base::scale_x(2), Base::scale_x(2), Base::scale_x(3),
573 80824 Base::scale_x(3), Base::scale_x(3), Base::scale_x(4), Base::scale_x(4),
574 80824 Base::scale_x(4), Base::scale_x(5), Base::scale_x(5), Base::scale_x(5)};
575 }
576 80824 uint8x16_t gen_vsx_idx_diff_b() {
577 80824 return uint8x16_t{0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2};
578 }
579
580 242472 void fill_full_constants_vectorially(
581 FullVectorInterpolationConstants &constants, uint32x4x4_t vsx,
582 uint8x16_t vsx_idx_diff, uint64_t sx_fixp, unsigned in_pixel_index) {
583 484944 uint64_t src_element_index_base =
584 242472 ((sx_fixp >> kFixpBits) * kChannels) + in_pixel_index;
585 242472 constants.src_element_index =
586 242472 static_cast<ptrdiff_t>(src_element_index_base);
587
588 // Create x coordinate for all lanes
589 242472 uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp & ((1 << kFixpBits) - 1));
590 242472 uint32x4_t vfrac = vdupq_n_u32(xfrac0);
591 242472 uint8x16x2_t vsx_delta_lo, vsx_delta_hi;
592 242472 vsx_delta_lo.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[0], vfrac));
593 242472 vsx_delta_lo.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[1], vfrac));
594 242472 vsx_delta_hi.val[0] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[2], vfrac));
595 242472 vsx_delta_hi.val[1] = vreinterpretq_u8_u32(vaddq_u32(vsx.val[3], vfrac));
596
597 // Get index from coordinate
598 242472 uint8x8_t idx0 = vqtbl2_u8(vsx_delta_lo, Base::vsidx_tbl_);
599 242472 uint8x8_t idx1 = vqtbl2_u8(vsx_delta_hi, Base::vsidx_tbl_);
600 242472 uint8x16_t vsx0_idx = vcombine_u8(idx0, idx1);
601 // One step in x means 3 steps in elements
602 242472 vsx0_idx = vmulq_u8(vsx0_idx, vdupq_n_u8(3));
603 // Align the stepping if the first lane is green or blue
604 242472 vsx0_idx = vqsubq_u8(vsx0_idx, vdupq_n_u8(in_pixel_index));
605 // Add in-pixel index
606 242472 vsx0_idx = vaddq_u8(vsx0_idx, vsx_idx_diff);
607 242472 vst1q(constants.idx, vsx0_idx);
608
609 // Get fraction from coordinate
610 242472 uint16x8x2_t vsxfrac;
611 242472 vsxfrac.val[0] =
612 242472 vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_lo, Base::vsfrac_tbl_));
613 242472 vsxfrac.val[1] =
614 242472 vreinterpretq_u16_u8(vqtbl2q_u8(vsx_delta_hi, Base::vsfrac_tbl_));
615 242472 VecTraits<uint16_t>::store(vsxfrac, constants.xfrac);
616 242472 }
617
618 136 void fill_full_constants_scalarly(FullVectorInterpolationConstants &constants,
619 unsigned in_pixel_index,
620 uint64_t src_element_index,
621 uint64_t src_element_base,
622 uint64_t sx_fixp) {
623 136 constants.src_element_index = static_cast<ptrdiff_t>(src_element_base);
624
625 272 fill_idx_xfrac(constants, in_pixel_index, src_element_index,
626 136 src_element_base, sx_fixp);
627 136 }
628
629 225 void fill_half_constants_scalarly(HalfVectorInterpolationConstants &constants,
630 uint64_t dst_element_index,
631 unsigned in_pixel_index,
632 uint64_t src_element_index,
633 uint64_t src_element_base,
634 uint64_t sx_fixp) {
635 225 constants.dst_element_index = static_cast<ptrdiff_t>(dst_element_index);
636 225 constants.src_element_index = static_cast<ptrdiff_t>(src_element_base);
637
638 450 fill_idx_xfrac(constants, in_pixel_index, src_element_index,
639 225 src_element_base, sx_fixp);
640 225 }
641
// Scalar fill of the idx and xfrac arrays of a constants entry
// (works for both the full and the half variant via VectorConstants).
//
// constants          entry whose idx[] and xfrac[] arrays are written
// in_pixel_index     channel position of lane 0 inside its pixel; the first
//                    pixel contributes only (kChannels - in_pixel_index) lanes
// src_element_index  source element index belonging to lane 0
// src_element_base   element offset that idx values are relative to
// sx_fixp            fixed-point source x coordinate of the first pixel
//
// Each lane gets idx = (source element index - src_element_base), truncated
// to uint8_t, and xfrac = the upper half of the kFixpBits-wide fractional
// part of the current sx_fixp.
642 template <typename VectorConstants>
643 361 void fill_idx_xfrac(VectorConstants &constants, unsigned in_pixel_index,
644 uint64_t src_element_index, uint64_t src_element_base,
645 uint64_t sx_fixp) {
646 // For indexing inside idx and xfrac arrays of
647 // the interpolation constants
648 361 unsigned j = 0;
649 361 uint8_t idx = (src_element_index - src_element_base);
650 361 uint16_t xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2);
651
// Lanes of the first pixel, which may already be partly consumed
// (in_pixel_index channels are skipped). All of them share one xfrac.
652
8/8
✓ Branch 0 taken 185 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 163 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 165 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 279 times.
✓ Branch 7 taken 138 times.
1153 for (; j < (kChannels - in_pixel_index); ++j) {
653 792 constants.idx[j] = idx + j;
654 792 constants.xfrac[j] = xfrac;
655 792 }
656
// Advance to the next destination pixel and recompute its source element
// index (start of the pixel), relative index and fraction.
657 361 sx_fixp += sx_fixp_one_dst_pixel_;
658 361 src_element_index = (sx_fixp >> kFixpBits) * kChannels;
659 361 idx = (src_element_index - src_element_base);
660 361 xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2);
661
// idx is a uint8_t array, so its size in bytes equals its lane count.
662 361 constexpr size_t idx_frac_elem_num = sizeof(VectorConstants::idx);
663
// Remaining lanes: one pixel (up to kChannels lanes) per outer iteration.
// The inner loop stops early when the arrays are full, so the last pixel
// may be written only partially.
664
8/8
✓ Branch 0 taken 370 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 193 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 310 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 300 times.
✓ Branch 7 taken 138 times.
1534 while (j < idx_frac_elem_num) {
665 // k is the index for the elements in one pixel
666
16/16
✓ Branch 0 taken 74 times.
✓ Branch 1 taken 1295 times.
✓ Branch 2 taken 999 times.
✓ Branch 3 taken 370 times.
✓ Branch 4 taken 87 times.
✓ Branch 5 taken 639 times.
✓ Branch 6 taken 533 times.
✓ Branch 7 taken 193 times.
✓ Branch 8 taken 62 times.
✓ Branch 9 taken 1075 times.
✓ Branch 10 taken 827 times.
✓ Branch 11 taken 310 times.
✓ Branch 12 taken 138 times.
✓ Branch 13 taken 987 times.
✓ Branch 14 taken 825 times.
✓ Branch 15 taken 300 times.
4357 for (unsigned k = 0; (j < idx_frac_elem_num) && (k < kChannels);
667 3184 ++j, ++k) {
668 3184 constants.idx[j] = idx + k;
669 3184 constants.xfrac[j] = xfrac;
670 3184 }
// Step to the following destination pixel for the next outer iteration.
671 1173 sx_fixp += sx_fixp_one_dst_pixel_;
672 1173 src_element_index = (sx_fixp >> kFixpBits) * kChannels;
673 1173 idx = (src_element_index - src_element_base);
674 1173 xfrac = (sx_fixp & ((1 << kFixpBits) - 1)) >> (kFixpBits / 2);
675 }
676 361 }
677
// Number of elements that make up one pixel in this class; used both as the
// per-pixel lane count and as the multiplier from pixel to element index.
678 static constexpr size_t kChannels = 3;
679 // Difference in source x coordinate for one destination pixel
// (fixed-point; added to sx_fixp once per destination pixel)
680 const size_t sx_fixp_one_dst_pixel_;
681 };
682
// Produces the destination rows [y_begin, y_end) by interpolating between two
// neighbouring source rows, driven by per-row constants supplied by the caller
// through RowInterpolationConstants.
//
// Template parameters:
// - kRatio: selects how many 16-byte source vectors are loaded and used as the
//   lookup table per output vector. vector_path() only has lookup branches
//   for kRatio == 2 and kRatio == 3.
// - kChannels: elements per pixel; also the distance between a lane's left
//   and right neighbour in the source row.
// - kSetRightmostLanes: when true, the last lane(s) of the right-neighbour
//   vectors are patched with scalar loads because the table lookup would
//   index past the loaded table.
683 template <ptrdiff_t kRatio, ptrdiff_t kChannels,
684 bool kSetRightmostLanes = false>
685 class ResizeGenericU8Operation final {
686 public:
// Stores the source/destination row accessors and the row range to process.
// src_height and dst_height are only used for the y coordinate mapping.
687 301 ResizeGenericU8Operation(const uint8_t *src, size_t src_stride,
688 size_t src_height, size_t y_begin, size_t y_end,
689 uint8_t *dst, size_t dst_stride, size_t dst_height)
690 301 : src_rows_{src, src_stride, kChannels},
691 301 dst_rows_{dst, dst_stride, kChannels},
692 301 src_height_{src_height},
693 301 y_begin_{y_begin},
694 301 y_end_{y_end},
695 301 dst_height_{dst_height} {}
696
// Runs process_row() for every destination row in [y_begin_, y_end_).
697 301 void process_rows(RowInterpolationConstants &row_interpolation_constants) {
698
14/16
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 448 times.
✓ Branch 2 taken 54 times.
✓ Branch 3 taken 688 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 43 times.
✓ Branch 7 taken 643 times.
✓ Branch 8 taken 48 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 54 times.
✓ Branch 11 taken 572 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 13 times.
✓ Branch 14 taken 46 times.
✓ Branch 15 taken 536 times.
3533 for (uint64_t dst_y = y_begin_; dst_y < y_end_; ++dst_y) {
699 3232 process_row(dst_y, row_interpolation_constants);
700 3232 }
701 301 }
702
703 private:
// Maps a destination row to a source y coordinate via aligned_scale().
// The result is consumed as a fixed-point value by process_row().
// NOTE(review): aligned_scale() is defined outside this section - confirm
// its exact rounding/alignment behaviour there.
704 3232 uint64_t to_src_y(uint64_t dy) const {
705 3232 return aligned_scale(dy, src_height_, dst_height_);
706 }
707
// Computes one destination row: picks the two source rows around the mapped
// y coordinate, derives the 8-bit vertical weight, then runs the full-vector
// paths followed by the half-vector paths.
708 3232 void process_row(uint64_t dy,
709 RowInterpolationConstants &row_interpolation_constants) {
710 3232 VectorPathNums num_of_vector_paths =
711 3232 row_interpolation_constants.num_of_vector_paths();
712 6464 auto *full_array =
713 3232 row_interpolation_constants.full_vector_constants_array();
714 6464 auto *half_array =
715 3232 row_interpolation_constants.half_vector_constants_array();
716
// Integer part of the fixed-point y selects the top row; the bottom row is
// always the one directly below it (sy + 1).
717 3232 uint64_t sy_fixp = to_src_y(dy);
718 3232 ptrdiff_t sy = static_cast<ptrdiff_t>(sy_fixp >> kFixpBits);
719 3232 const uint8_t *src_top = &src_rows_.at(sy)[0];
720 3232 const uint8_t *src_bottom = &src_rows_.at(sy + 1)[0];
721 3232 uint8_t *dst = &dst_rows_.at(static_cast<ptrdiff_t>(dy))[0];
722 // Get the highest 8 bits of the fractional part
723 // This is a good compromise between accuracy and performance
724 // Because the result is 8bits, the error only affects the least
725 // significant 1-2 bits, see the accuracy calculation in kleidicv.h
726 6464 uint16_t yfrac =
727 3232 static_cast<uint16_t>((sy_fixp - (sy << kFixpBits)) >> (kFixpBits - 8));
728
729 3232 ptrdiff_t dst_element_index = 0;
730
// Full-vector paths are processed in pairs: each iteration consumes two
// consecutive constants entries and advances the destination by 2 * kStep
// elements.
731
14/16
✓ Branch 0 taken 21912 times.
✓ Branch 1 taken 448 times.
✓ Branch 2 taken 45764 times.
✓ Branch 3 taken 688 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 68368 times.
✓ Branch 7 taken 643 times.
✓ Branch 8 taken 20864 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 44414 times.
✓ Branch 11 taken 572 times.
✓ Branch 12 taken 1068 times.
✓ Branch 13 taken 13 times.
✓ Branch 14 taken 67072 times.
✓ Branch 15 taken 536 times.
272694 for (size_t i = 0; i < num_of_vector_paths.two_x; i += 1) {
732 269462 uint8x16x2_t res{};
733 269462 res.val[0] = vector_path(full_array[i * 2], src_top, src_bottom, yfrac);
734 269462 res.val[1] =
735 269462 vector_path(full_array[(i * 2) + 1], src_top, src_bottom, yfrac);
736 269462 VecTraits<uint8_t>::store(res, &dst[dst_element_index]);
737 269462 dst_element_index += kStep * 2;
738 269462 }
739
// Half-vector paths carry their own destination offset, so they do not use
// the running dst_element_index from the loop above.
740
14/16
✓ Branch 0 taken 448 times.
✓ Branch 1 taken 1086 times.
✓ Branch 2 taken 688 times.
✓ Branch 3 taken 1642 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 643 times.
✓ Branch 7 taken 1505 times.
✓ Branch 8 taken 332 times.
✓ Branch 9 taken 880 times.
✓ Branch 10 taken 572 times.
✓ Branch 11 taken 1494 times.
✓ Branch 12 taken 13 times.
✓ Branch 13 taken 39 times.
✓ Branch 14 taken 536 times.
✓ Branch 15 taken 1620 times.
11498 for (size_t i = 0; i < num_of_vector_paths.half; i += 1) {
741 8266 auto res = vector_path_half(half_array[i], yfrac, src_top, src_bottom);
742 8266 vst1(&dst[half_array[i].dst_element_index], res);
743 8266 }
744 3232 }
745
// Interpolates 8 destination elements.
// a/b are the left/right neighbours taken from the top row, c/d the same from
// the bottom row; the right neighbour index is the left one plus kChannels.
// The result is blended vertically with yfrac first, then horizontally with
// the per-lane xfrac values.
746 8266 uint8x8_t vector_path_half(const HalfVectorInterpolationConstants &constants,
747 uint16_t yfrac, const uint8_t *src_top,
748 const uint8_t *src_bottom) const {
749 8266 uint8x8_t vsx0_idx = vld1_u8(constants.idx);
750 8266 uint8x8_t vsx1_idx = vadd_u8(vsx0_idx, vdup_n_u8(kChannels));
751 8266 uint16x8_t vsxfrac;
752 8266 VecTraits<uint16_t>::load(constants.xfrac, vsxfrac);
753 8266 ptrdiff_t src_element_index = constants.src_element_index;
754
// One 16-byte table is used only for kRatio == 2 with kChannels != 3;
// every other combination loads a two-register table.
755 using SrcVecType = std::conditional_t<kRatio == 2 && kChannels != 3,
756 uint8x16_t, uint8x16x2_t>;
757 8266 SrcVecType topsrc, bottomsrc;
758 8266 VecTraits<uint8_t>::load(&src_top[src_element_index], topsrc);
759 8266 VecTraits<uint8_t>::load(&src_bottom[src_element_index], bottomsrc);
760
761 8266 uint8x8_t a, b, c, d;
762 if constexpr (kRatio == 2 && kChannels != 3) {
763 2728 a = vqtbl1_u8(topsrc, vsx0_idx);
764 2728 b = vqtbl1_u8(topsrc, vsx1_idx);
765 2728 c = vqtbl1_u8(bottomsrc, vsx0_idx);
766 2728 d = vqtbl1_u8(bottomsrc, vsx1_idx);
767 } else if constexpr (kRatio == 3 || kChannels == 3) {
768 5538 a = vqtbl2_u8(topsrc, vsx0_idx);
769 5538 b = vqtbl2_u8(topsrc, vsx1_idx);
770 5538 c = vqtbl2_u8(bottomsrc, vsx0_idx);
771 5538 d = vqtbl2_u8(bottomsrc, vsx1_idx);
772 }
// Each blend widens the first operand to 16 bits (value << 8), adds
// (second - first) * weight, then narrows back with a rounding
// "add and keep high half".
773 16532 uint8x8_t left =
774 8266 vraddhn_u16(vshll_n_u8(a, 8), vmulq_n_u16(vsubl_u8(c, a), yfrac));
775 16532 uint8x8_t right =
776 8266 vraddhn_u16(vshll_n_u8(b, 8), vmulq_n_u16(vsubl_u8(d, b), yfrac));
777 16532 uint8x8_t res = vraddhn_u16(vshll_n_u8(left, 8),
778 8266 vmulq_u16(vsubl_u8(right, left), vsxfrac));
779 16532 return res;
780 8266 }
781
// Interpolates 16 destination elements; same scheme as vector_path_half() but
// on full vectors, with the 16-bit arithmetic split into low and high halves.
782 538924 uint8x16_t vector_path(const FullVectorInterpolationConstants &constants,
783 const uint8_t *src_top, const uint8_t *src_bottom,
784 uint16_t yfrac) const {
785 538924 uint8x16_t vsx0_idx = vld1q(constants.idx);
786 538924 uint8x16_t vsx1_idx = vaddq_u8(vsx0_idx, vdupq_n_u8(kChannels));
787 538924 uint16x8x2_t vsxfrac2;
788 538924 VecTraits<uint16_t>::load(constants.xfrac, vsxfrac2);
789 538924 ptrdiff_t src_element_index = constants.src_element_index;
790
// kRatio == 2 uses a two-register lookup table, anything else a
// three-register one.
791 using SrcVecType =
792 std::conditional_t<kRatio == 2, uint8x16x2_t, uint8x16x3_t>;
793 538924 SrcVecType topsrc, bottomsrc;
794 538924 VecTraits<uint8_t>::load(&src_top[src_element_index], topsrc);
795 538924 VecTraits<uint8_t>::load(&src_bottom[src_element_index], bottomsrc);
796 538924 uint8x16_t a, b, c, d;
797 if constexpr (kRatio == 2) {
798 272088 a = vqtbl2q_u8(topsrc, vsx0_idx);
799 272088 b = vqtbl2q_u8(topsrc, vsx1_idx);
800 272088 c = vqtbl2q_u8(bottomsrc, vsx0_idx);
801 272088 d = vqtbl2q_u8(bottomsrc, vsx1_idx);
802 if constexpr (kSetRightmostLanes) {
803 // table lookup would overindex topsrc and bottomsrc
// so lanes 14 and 15 of the right neighbours are read directly from memory
804 ptrdiff_t last_but_one_right_elem_idx =
805 src_element_index + constants.idx[14] + kChannels;
806 ptrdiff_t last_right_elem_idx =
807 src_element_index + constants.idx[15] + kChannels;
808 b = vsetq_lane_u8(src_top[last_but_one_right_elem_idx], b, 14);
809 b = vsetq_lane_u8(src_top[last_right_elem_idx], b, 15);
810 d = vsetq_lane_u8(src_bottom[last_but_one_right_elem_idx], d, 14);
811 d = vsetq_lane_u8(src_bottom[last_right_elem_idx], d, 15);
812 }
813 } else if constexpr (kRatio == 3) {
814 266836 a = vqtbl3q_u8(topsrc, vsx0_idx);
815 266836 b = vqtbl3q_u8(topsrc, vsx1_idx);
816 266836 c = vqtbl3q_u8(bottomsrc, vsx0_idx);
817 266836 d = vqtbl3q_u8(bottomsrc, vsx1_idx);
818 // table lookup would overindex topsrc and bottomsrc
// here only lane 15 of the right neighbours is read directly from memory
819 if constexpr (kSetRightmostLanes) {
820 4272 ptrdiff_t last_right_elem_idx =
821 2136 src_element_index + constants.idx[15] + kChannels;
822 2136 b = vsetq_lane_u8(src_top[last_right_elem_idx], b, 15);
823 2136 d = vsetq_lane_u8(src_bottom[last_right_elem_idx], d, 15);
824 2136 }
825 }
// Vertical blend (top -> bottom) of the left and right neighbours, then
// horizontal blend (left -> right) with the per-lane x fractions.
826 538924 uint8x8_t left_lo = lerp_low_half(a, c, yfrac);
827 538924 uint8x8_t left_hi = lerp_high_half(a, c, yfrac);
828 538924 uint8x8_t right_lo = lerp_low_half(b, d, yfrac);
829 538924 uint8x8_t right_hi = lerp_high_half(b, d, yfrac);
830 538924 uint8x8_t res_lo = lerp_full(left_lo, right_lo, vsxfrac2.val[0]);
831 538924 uint8x8_t res_hi = lerp_full(left_hi, right_hi, vsxfrac2.val[1]);
832 1077848 return vcombine_u8(res_lo, res_hi);
833 538924 }
834
// Blends the low 8 lanes of a and b with the scalar weight w:
// (a << 8) + (b - a) * w in 16-bit lanes, narrowed back to 8 bits with
// rounding.
835 1077848 static uint8x8_t lerp_low_half(uint8x16_t a, uint8x16_t b, uint16_t w) {
836 1077848 return vraddhn_u16(
837 1077848 vshll_n_u8(vget_low_u8(a), 8),
838 1077848 vmulq_n_u16(vsubl_u8(vget_low_u8(b), vget_low_u8(a)), w));
839 }
840
// Same as lerp_low_half(), but for the high 8 lanes of a and b.
841 1077848 static uint8x8_t lerp_high_half(uint8x16_t a, uint8x16_t b, uint16_t w) {
842 2155696 return vraddhn_u16(vshll_high_n_u8(a, 8),
843 1077848 vmulq_n_u16(vsubl_high_u8(b, a), w));
844 }
845
// Blends two 8-lane vectors with an individual weight per lane.
846 1077848 static uint8x8_t lerp_full(uint8x8_t a, uint8x8_t b, uint16x8_t w) {
847 1077848 return vraddhn_u16(vshll_n_u8(a, 8), vmulq_u16(vsubl_u8(b, a), w));
848 }
849
// Row accessors for the source and destination images
850 const Rows<const uint8_t> src_rows_;
851 const Rows<uint8_t> dst_rows_;
// Source image height, used for the y coordinate mapping
852 const size_t src_height_;
// Destination rows processed by this instance: [y_begin_, y_end_)
853 const size_t y_begin_;
854 const size_t y_end_;
// Destination image height, used for the y coordinate mapping
855 const size_t dst_height_;
856 };
857
858 } // namespace kleidicv::neon::resize_linear_generic_u8
859