KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/resize/resize_linear_generic_sc.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 375 375 100.0%
Functions: 552 552 100.0%
Branches: 128 128 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H
6 #define KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H
7
8 #include <algorithm>
9 #include <cstddef>
10 #include <memory>
11
12 #include "kleidicv/kleidicv.h"
13 #include "kleidicv/sve2.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 //------------------------------------------------------
18 /// Generic resize for ratios 1/3 to 1/1, u8, 1 to 3 channels
19 //------------------------------------------------------
20
21 namespace resize_generic_u8 {
22
23 // For the coordinate calculation, fixed-point format is used, for better
24 // performance. Fixed-point format:
25 // - lowest 16 bits are the fractional part, that is the kFixpBits constant
26 // - at interpolation, the high 8 bits are used from the fractional part
27 // (this is a good compromise between accuracy and performance: because the
28 // result is 8bits, the error only affects the least significant 1-2 bits, see
29 // the accuracy calculation in kleidicv.h)
30 // - to get the integer part, right shift by 16 bits, or zip/unzip/tbl etc. to
31 // get the bytes needed
32 // - for better accuracy, rounding is needed everywhere, i.e. adding 0.5, which
33 // is 1 << 15
34
35 static constexpr ptrdiff_t kFixpBits = 16;
36 static constexpr ptrdiff_t kFixpHalf = (1UL << (kFixpBits - 1));
37
38 // Precalc 1 item:
39 // Frac: 2 vectors u16
40 // Idx: 1 vector u8 (left_idx)
41 // Src_index: uint64 (separate array)
42 template <size_t kRatio>
43 struct PrecalcIterator {
44 size_t index_;
45 uint64_t *src_index_ptr_;
46 const size_t kStep, kIdxFracStep;
47 uint8_t *idx_ptr_;
48 uint16_t *frac_ptr_;
49 5267 PrecalcIterator(size_t kStepDst, uint64_t *src_indices,
50 uint8_t *p_idx_frac) KLEIDICV_STREAMING
51 5267 : index_{0},
52 5267 src_index_ptr_{src_indices},
53 5267 kStep{kStepDst},
54 5267 kIdxFracStep{kStep * (2 + 1)},
55 5267 idx_ptr_{p_idx_frac},
56 5267 frac_ptr_{reinterpret_cast<uint16_t *>(p_idx_frac + kStep)} {}
57
58 769817 PrecalcIterator &operator++() KLEIDICV_STREAMING {
59 769817 ++index_;
60 769817 ++src_index_ptr_;
61 769817 idx_ptr_ += kIdxFracStep;
62 769817 frac_ptr_ += kIdxFracStep / 2;
63 769817 return *this;
64 }
65 };
66
67 template <ptrdiff_t kRatio, ptrdiff_t kChannels>
68 class PrecalcIndicesFractions final {
69 public:
70 457 PrecalcIndicesFractions(size_t src_width, size_t dst_width,
71 ptrdiff_t kStep) KLEIDICV_STREAMING
72 457 : src_width_{src_width},
73 457 dst_width_{dst_width},
74 457 n_iterations_{0},
75 457 n_iterations_2x_{0},
76 457 kStep_{kStep},
77 457 precalc_src_bases_{nullptr, &std::free},
78 457 precalc_idx_frac_{nullptr, &std::free} {}
79
80 5267 PrecalcIterator<kRatio> begin() const KLEIDICV_STREAMING {
81 10534 return PrecalcIterator<kRatio>(kStep_, precalc_src_bases_.get(),
82 5267 precalc_idx_frac_.get());
83 }
84
85 318 bool precalculate_indices_fractions_srcindices() KLEIDICV_STREAMING {
86
8/8
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 81 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 72 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 81 times.
✓ Branch 7 taken 3 times.
318 if (!allocate_temp_buffers()) {
87 12 return false;
88 }
89
90 // These starting values are not aligned to center. The center alignment
91 // must be added only once. When added to a center-aligned source_x
92 // value, the result will be center-aligned.
93 306 svuint32_t vsx0b = make_vsx0(0);
94 306 svuint32_t vsx0t = make_vsx0(1);
95 306 svuint32_t vsx1b = make_vsx0(2 * svcntw());
96 306 svuint32_t vsx1t = make_vsx0(2 * svcntw() + 1);
97 // from each even 16bit element, take the low byte, and the high is 0
98 612 svuint8_t vsxfrac_bottom_tbl =
99 306 svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004));
100 // from each odd 16bit element, take the low byte, and the high is 0
101 612 svuint8_t vsxfrac_top_tbl =
102 306 svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004));
103
104 612 svuint8_t vchannels = svreinterpret_u8_u32(
105 306 svdup_n_u32(kChannels == 4 ? 0x03020100U : 0x01000100));
106
107 // Difference in source x coordinate, for one vector path
108 612 const uint64_t sx_fixp_step = rounding_div(
109 306 ((src_width_ * kStep_ / kChannels) << kFixpBits), dst_width_);
110 306 uint64_t sx_fixp = to_src_x(0);
111 612 const uint64_t max_src_index =
112 306 std::max(src_width_ * kChannels - kStep_ * kRatio, 0UL);
113 // For 1,2,4 channels dx can be iterated vector by vector, but not for 3
114 306 ptrdiff_t dx = 0;
115
8/8
✓ Branch 0 taken 29827 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 59729 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 30434 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 61005 times.
✓ Branch 7 taken 81 times.
181301 for (auto pcit = begin(); pcit.index_ < n_iterations_;
116 180995 ++pcit, dx += kStep_ / kChannels) {
117 // Repeatedly adding sx_fixp_step is faster than multiplication, but it
118 // accumulates fixed-point error; periodic recalibration resets it. The
119 // maximum per-addition error of sx_fixp_step is 0.5 / (1 << 16). Only the
120 // upper 8 bits of the 16-bit fractional part are used for interpolation,
121 // so once the accumulated error reaches 1 / (1 << 8), it can affect later
122 // stages. This corresponds to 512 additions, so sx_fixp is recomputed
123 // every 512 iterations, which is what this mask selects.
124 180995 constexpr uint64_t kRecalibrateCycleMask = ((1 << 9) - 1);
125
8/8
✓ Branch 0 taken 29699 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 59534 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 30305 times.
✓ Branch 5 taken 129 times.
✓ Branch 6 taken 60808 times.
✓ Branch 7 taken 197 times.
180995 if ((pcit.index_ & kRecalibrateCycleMask) == 0) {
126 649 sx_fixp = to_src_x(dx);
127 649 }
128
129
8/8
✓ Branch 0 taken 29796 times.
✓ Branch 1 taken 31 times.
✓ Branch 2 taken 59668 times.
✓ Branch 3 taken 61 times.
✓ Branch 4 taken 30405 times.
✓ Branch 5 taken 29 times.
✓ Branch 6 taken 60948 times.
✓ Branch 7 taken 57 times.
180995 n_iterations_2x_ = (sx_fixp >> kFixpBits) * kChannels <= max_src_index
130 180817 ? pcit.index_
131 178 : n_iterations_2x_;
132 180995 calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b, vsx0t, vsx1b,
133 vsx1t, vsxfrac_bottom_tbl,
134 vsxfrac_top_tbl, vchannels);
135 180995 sx_fixp += sx_fixp_step;
136 180995 }
137 306 return true;
138 318 }
139
140 139 bool precalculate_indices_fractions_srcindices_3ch() KLEIDICV_STREAMING {
141
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 69 times.
139 if (!allocate_temp_buffers()) {
142 6 return false;
143 }
144
145 // These starting values are not aligned to center. The center alignment
146 // must be added only once. When added to a center-aligned source_x
147 // value, the result will be center-aligned.
148 133 svuint32_t vsx0b_R = make_vsx0(0);
149 133 svuint32_t vsx0t_R = make_vsx0(1);
150 133 svuint32_t vsx1b_R = make_vsx0(2 * svcntw());
151 133 svuint32_t vsx1t_R = make_vsx0(2 * svcntw() + 1);
152
153 133 svuint32_t vsx0b_G = make_vsx0(4 * svcntw());
154 133 svuint32_t vsx0t_G = make_vsx0(4 * svcntw() + 1);
155 133 svuint32_t vsx1b_G = make_vsx0(6 * svcntw());
156 133 svuint32_t vsx1t_G = make_vsx0(6 * svcntw() + 1);
157
158 133 svuint32_t vsx0b_B = make_vsx0(8 * svcntw());
159 133 svuint32_t vsx0t_B = make_vsx0(8 * svcntw() + 1);
160 133 svuint32_t vsx1b_B = make_vsx0(10 * svcntw());
161 133 svuint32_t vsx1t_B = make_vsx0(10 * svcntw() + 1);
162
163 133 size_t kVL = svcntb();
164 133 svuint8_t vchannels_R = svindex_u8(0, 1);
165 133 svuint8_t vchannels_G = svindex_u8(kVL % 3, 1);
166 133 svuint8_t vchannels_B = svindex_u8((kVL + kVL) % 3, 1);
167 // Decrease by 3 while they are >= 3 --> so we get the modulo
168 133 size_t steps = (kVL - 1) / 3;
169
4/4
✓ Branch 0 taken 992 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 1081 times.
✓ Branch 3 taken 69 times.
2206 for (size_t i = 0; i < steps; ++i) {
170 4146 vchannels_R = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_R, 3),
171 2073 vchannels_R, 3);
172 4146 vchannels_G = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_G, 3),
173 2073 vchannels_G, 3);
174 4146 vchannels_B = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), vchannels_B, 3),
175 2073 vchannels_B, 3);
176 2073 }
177
178 // from each even 16bit element, take the low byte, and the high is 0
179 266 svuint8_t vsxfrac_bottom_tbl =
180 133 svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004));
181 // from each odd 16bit element, take the low byte, and the high is 0
182 266 svuint8_t vsxfrac_top_tbl =
183 133 svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004));
184
185 // Difference in source x coordinate, for three vector paths (one iteration
186 // in this calculation)
187 266 const uint64_t sx_fixp_step3 =
188 133 rounding_div((src_width_ * kStep_) << kFixpBits, dst_width_);
189 133 uint64_t sx_fixp = to_src_x(0);
190 266 const uint64_t max_src_index =
191 133 std::max(src_width_ * kChannels - kStep_ * kRatio, 0UL);
192 133 ptrdiff_t dx = 0;
193 133 auto pcit = begin();
194
4/4
✓ Branch 0 taken 29 times.
✓ Branch 1 taken 29837 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 30590 times.
60474 while (pcit.index_ < n_iterations_) {
195 // Repeatedly adding sx_fixp_step3 is faster than multiplication, but it
196 // accumulates fixed-point error; periodic recalibration resets it. The
197 // maximum per-addition error of sx_fixp_step3 is 0.5 / (1 << 16). Only the
198 // upper 8 bits of the 16-bit fractional part are used for interpolation,
199 // so once the accumulated error reaches 1 / (1 << 8), it can affect later
200 // stages. This corresponds to 512 additions, but this check is only reached
201 // on every 3rd index, so the recalibration period is set to 128 (7-bit mask).
202 60427 constexpr uint64_t kRecalibrateCycleMask = ((1 << 7) - 1);
203
4/4
✓ Branch 0 taken 29544 times.
✓ Branch 1 taken 293 times.
✓ Branch 2 taken 30286 times.
✓ Branch 3 taken 304 times.
60427 if ((pcit.index_ & kRecalibrateCycleMask) == 0) {
204 597 sx_fixp = to_src_x(dx);
205 597 }
206
207 60427 calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_R, vsx0t_R,
208 vsx1b_R, vsx1t_R, vsxfrac_bottom_tbl,
209 vsxfrac_top_tbl, vchannels_R);
210
4/4
✓ Branch 0 taken 29822 times.
✓ Branch 1 taken 15 times.
✓ Branch 2 taken 30580 times.
✓ Branch 3 taken 10 times.
60427 n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index
211 60402 ? pcit.index_
212 25 : n_iterations_2x_;
213 60427 ++pcit;
214
4/4
✓ Branch 0 taken 29 times.
✓ Branch 1 taken 29808 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 30580 times.
60427 if (pcit.index_ >= n_iterations_) {
215 39 break;
216 }
217 60388 calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_G, vsx0t_G,
218 vsx1b_G, vsx1t_G, vsxfrac_bottom_tbl,
219 vsxfrac_top_tbl, vchannels_G);
220
4/4
✓ Branch 0 taken 29790 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 30537 times.
✓ Branch 3 taken 43 times.
60388 n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index
221 60327 ? pcit.index_
222 61 : n_iterations_2x_;
223 60388 ++pcit;
224
4/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 29802 times.
✓ Branch 2 taken 41 times.
✓ Branch 3 taken 30539 times.
60388 if (pcit.index_ >= n_iterations_) {
225 47 break;
226 }
227 60341 calculate_indices_fractions_srcindex(pcit, sx_fixp, vsx0b_B, vsx0t_B,
228 vsx1b_B, vsx1t_B, vsxfrac_bottom_tbl,
229 vsxfrac_top_tbl, vchannels_B);
230
4/4
✓ Branch 0 taken 29771 times.
✓ Branch 1 taken 31 times.
✓ Branch 2 taken 30519 times.
✓ Branch 3 taken 20 times.
60341 n_iterations_2x_ = *pcit.src_index_ptr_ <= max_src_index
231 60290 ? pcit.index_
232 51 : n_iterations_2x_;
233 60341 ++pcit;
234 60341 sx_fixp += sx_fixp_step3;
235 60341 dx += kStep_;
236 60427 }
237 133 return true;
238 139 }
239
240 17206 size_t n_iterations() const KLEIDICV_STREAMING { return n_iterations_; }
241 202472 size_t n_iterations_2x() const KLEIDICV_STREAMING { return n_iterations_2x_; }
242 uint64_t *src_bases() const KLEIDICV_STREAMING {
243 return precalc_src_bases_.get();
244 }
245 uint8_t *idx_frac() const KLEIDICV_STREAMING {
246 return precalc_idx_frac_.get();
247 }
248
249 private:
250 using FreeDeleter = decltype(&std::free);
251
252 457 bool allocate_temp_buffers() KLEIDICV_STREAMING {
253 // Allocate a bit more so we don't have to care about overindexing
254 457 ptrdiff_t rounded_width = align_up(dst_width_ * kChannels, kStep_);
255 457 n_iterations_ = rounded_width / kStep_;
256 457 size_t idx_bytes = sizeof(uint8_t) * rounded_width;
257 457 size_t xfrac_bytes = sizeof(uint16_t) * rounded_width;
258 914 precalc_idx_frac_.reset(
259 457 static_cast<uint8_t *>(malloc(idx_bytes + xfrac_bytes)));
260 457 size_t src_bases_bytes = sizeof(uint64_t) * rounded_width / kStep_;
261 457 precalc_src_bases_.reset(static_cast<uint64_t *>(malloc(src_bases_bytes)));
262 1371 return (reinterpret_cast<uintptr_t>(precalc_idx_frac_.get()) &
263 457 reinterpret_cast<uintptr_t>(precalc_src_bases_.get()));
264 457 }
265
266 template <typename T = uint64_t>
267 35868 static T rounding_div(uint64_t nom, uint64_t denom) KLEIDICV_STREAMING {
268 35868 return static_cast<T>((nom + denom / 2) / denom);
269 }
270
271 // Scale coordinate using this formula, so the center is aligned:
272 // source_x = (destination_x + 0.5) / scale - 0.5;
273 // plus 1/256/2 for later rounding the fractional part to 8bits
274 1685 static uint64_t aligned_scale(uint64_t x, uint64_t nom,
275 uint64_t denom) KLEIDICV_STREAMING {
276 3370 return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) -
277 1685 kFixpHalf + (1 << (kFixpBits - 9));
278 }
279
280 1685 uint64_t to_src_x(uint64_t dx) const KLEIDICV_STREAMING {
281 1685 return aligned_scale(dx, src_width_, dst_width_);
282 }
283
284 // Scale destination x coordinate to source x coordinate, into fixed-point,
285 // without center correction
286 33744 uint32_t scale_x(uint64_t dx) const KLEIDICV_STREAMING {
287 33744 return rounding_div<uint32_t>(((dx * src_width_) << kFixpBits), dst_width_);
288 }
289
290 2820 svuint32_t make_vsx0(uint64_t dx) const KLEIDICV_STREAMING {
291 // Creates source x coordinates starting with dx, stepping by 2
292 // and finally shifted left by 8, to support the later svaddhn operation
293 2820 uint32_t sx[64]; // maximum possible vector length in u32 units
294
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 3456 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 3888 times.
✓ Branch 4 taken 768 times.
✓ Branch 5 taken 9120 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 3456 times.
✓ Branch 8 taken 324 times.
✓ Branch 9 taken 3888 times.
✓ Branch 10 taken 828 times.
✓ Branch 11 taken 9936 times.
36564 for (size_t i = 0; i < svcntw(); ++i) {
295 33744 sx[i] = scale_x((dx + 2 * i) / kChannels) << 8;
296 33744 }
297 5640 return svld1(svptrue_b32(), sx);
298 2820 }
299
300 362151 void calculate_indices_fractions_srcindex(
301 PrecalcIterator<kRatio> &pcit, uint64_t sx_fixp, const svuint32_t &vsx0b,
302 const svuint32_t &vsx0t, const svuint32_t &vsx1b, const svuint32_t &vsx1t,
303 const svuint8_t &vsxfrac_bottom_tbl, const svuint8_t &vsxfrac_top_tbl,
304 [[maybe_unused]] const svuint8_t &vchannels) const KLEIDICV_STREAMING {
305 // << 8: to prepare for addhn, have the fractional part in the high half
306 724302 uint32_t xfrac0 = static_cast<uint32_t>(sx_fixp & ((1 << kFixpBits) - 1))
307 362151 << 8;
308 // get the interesting part: 8+8 bits of integer and fractional part
309 724302 svuint16x2_t vsx_delta =
310 724302 svcreate2(svaddhnt_n_u32(svaddhnb_n_u32(vsx0b, xfrac0), vsx0t, xfrac0),
311 362151 svaddhnt_n_u32(svaddhnb_n_u32(vsx1b, xfrac0), vsx1t, xfrac0));
312 if constexpr (kChannels == 3) {
313 // When vsx0 starts from other than zero, this offset must be subtracted
314 181156 uint16_t start{};
315 181156 svst1(svptrue_pat_b16(SV_VL1), &start, svget2(vsx_delta, 0));
316 181156 start = start & 0xFF00;
317 181156 vsx_delta =
318 362312 svcreate2(svsub_n_u16_x(svptrue_b16(), svget2(vsx_delta, 0), start),
319 181156 svsub_n_u16_x(svptrue_b16(), svget2(vsx_delta, 1), start));
320 181156 sx_fixp += (start >> 8) << kFixpBits;
321 181156 }
322 724302 svuint8x2_t vsx_delta8 =
323 724302 svcreate2(svreinterpret_u8_u16(svget2(vsx_delta, 0)),
324 362151 svreinterpret_u8_u16(svget2(vsx_delta, 1)));
325 // left pixels' indices: integer part
326 724302 svuint8_t vsx_left_idx =
327 362151 svuzp2_u8(svget2(vsx_delta8, 0), svget2(vsx_delta8, 1));
328 if constexpr (kChannels > 1) {
329 if constexpr (kChannels == 3) {
330 181156 vsx_left_idx = svmul_n_u8_x(svptrue_b8(), vsx_left_idx, 3);
331 } else {
332 static_assert(kChannels == 2 || kChannels == 4);
333 120734 vsx_left_idx =
334 120734 svlsl_n_u8_x(svptrue_b8(), vsx_left_idx, kChannels == 4 ? 2 : 1);
335 }
336 301890 vsx_left_idx = svadd_u8_x(svptrue_b8(), vsx_left_idx, vchannels);
337 }
338
339 362151 uint64_t srcindex = (sx_fixp >> kFixpBits) * kChannels;
340 if constexpr (kChannels == 3) {
341 // When vsx_left_idx starts from other than zero, this offset must be
342 // subtracted
343 181156 uint8_t start{};
344 181156 svst1(svptrue_pat_b8(SV_VL1), &start, vsx_left_idx);
345 181156 vsx_left_idx = svsub_n_u8_x(svptrue_b8(), vsx_left_idx, start);
346 181156 srcindex += start;
347 181156 }
348
349 362151 *pcit.src_index_ptr_ = srcindex;
350 362151 svst1(svptrue_b8(), pcit.idx_ptr_, vsx_left_idx);
351
352 // fractional part is widened to 16 bits for further operations
353 724302 svuint16_t vsxfrac_b =
354 362151 svreinterpret_u16_u8(svtbl2_u8(vsx_delta8, vsxfrac_bottom_tbl));
355 724302 svuint16_t vsxfrac_t =
356 362151 svreinterpret_u16_u8(svtbl2_u8(vsx_delta8, vsxfrac_top_tbl));
357 362151 svst1(svptrue_b16(), pcit.frac_ptr_, vsxfrac_b);
358 362151 svst1_vnum(svptrue_b16(), pcit.frac_ptr_, 1, vsxfrac_t);
359 362151 }
360
361 const size_t src_width_;
362 const size_t dst_width_;
363 size_t n_iterations_;
364 size_t n_iterations_2x_;
365 const ptrdiff_t kStep_;
366 std::unique_ptr<uint64_t, FreeDeleter> precalc_src_bases_;
367 std::unique_ptr<uint8_t, FreeDeleter> precalc_idx_frac_;
368 };
369
370 // ratio: number of vectors to load and resize to 1 vector
371 // - supported combinations of (ratio, channel):
372 // (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3)
373 template <ptrdiff_t kRatio, ptrdiff_t kChannels>
374 class ResizeGenericU8Operation final {
375 public:
376 457 ResizeGenericU8Operation(const uint8_t *src, size_t src_stride,
377 size_t src_width, size_t src_height, size_t y_begin,
378 size_t y_end,
379 uint8_t *dst, // NOLINT
380 size_t dst_stride, size_t dst_width,
381 size_t dst_height) KLEIDICV_STREAMING
382 457 : src_rows_{src, src_stride, kChannels},
383 457 dst_rows_{dst, dst_stride, kChannels},
384 457 src_width_{src_width},
385 457 src_height_{src_height},
386 457 y_begin_{y_begin},
387 457 y_end_{y_end},
388 457 dst_width_{dst_width},
389 457 dst_height_{dst_height},
390 457 kStep_{static_cast<ptrdiff_t>(svcntb())},
391 457 precalc_{src_width, dst_width, kStep_} {}
392
393 457 kleidicv_error_t process_rows() KLEIDICV_STREAMING {
394 457 bool precalc_success = false;
395 if constexpr (kChannels == 3) {
396 139 precalc_success =
397 139 precalc_.precalculate_indices_fractions_srcindices_3ch();
398 } else {
399 318 precalc_success = precalc_.precalculate_indices_fractions_srcindices();
400 }
401
12/12
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 81 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 64 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 3 times.
✓ Branch 8 taken 81 times.
✓ Branch 9 taken 3 times.
✓ Branch 10 taken 69 times.
✓ Branch 11 taken 3 times.
457 if (!precalc_success) {
402 18 return KLEIDICV_ERROR_ALLOCATION;
403 }
404
405
12/12
✓ Branch 0 taken 672 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1032 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 964 times.
✓ Branch 5 taken 64 times.
✓ Branch 6 taken 498 times.
✓ Branch 7 taken 72 times.
✓ Branch 8 taken 858 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 804 times.
✓ Branch 11 taken 69 times.
5267 for (uint64_t dst_y = y_begin_; dst_y < y_end_; ++dst_y) {
406 4828 process_row(dst_y);
407 4828 }
408
409 439 return KLEIDICV_OK;
410 457 }
411
412 private:
413 template <typename T = uint64_t>
414 4828 static T rounding_div(uint64_t nom, uint64_t denom) KLEIDICV_STREAMING {
415 4828 return static_cast<T>((nom + denom / 2) / denom);
416 }
417
418 // Scale coordinate using this formula, so the center is aligned:
419 // source_x = (destination_x + 0.5) / scale - 0.5;
420 // plus 1/256/2 for later rounding the fractional part to 8bits
421 4828 static uint64_t aligned_scale(uint64_t x, uint64_t nom,
422 uint64_t denom) KLEIDICV_STREAMING {
423 9656 return rounding_div(((x << kFixpBits) + kFixpHalf) * nom, denom) -
424 4828 kFixpHalf + (1 << (kFixpBits - 9));
425 }
426
427 4828 uint64_t to_src_y(uint64_t dy) const KLEIDICV_STREAMING {
428 4828 return aligned_scale(dy, src_height_, dst_height_);
429 }
430
431 1222998 static svuint16_t svshll8b(svuint8_t a) KLEIDICV_STREAMING {
432 1222998 return svreinterpret_u16_u8(svtrn1(svdup_n_u8(0), a));
433 }
434 1222998 static svuint16_t svshll8t(svuint8_t a) KLEIDICV_STREAMING {
435 1222998 return svreinterpret_u16_u8(svtrn2(svdup_n_u8(0), a));
436 }
437
438 398404 static svuint8x2_t load8x2_u8(const uint8_t *p) KLEIDICV_STREAMING {
439 #if KLEIDICV_TARGET_SME2
440 64120 return svld1_x2(svptrue_c8(), p);
441 #else
442 334284 return svcreate2(svld1(svptrue_b8(), p), svld1_vnum(svptrue_b8(), p, 1));
443 #endif
444 }
445
446 15268 svuint8x2_t load8x2_while_u8(const uint8_t *p, uint64_t i,
447 uint64_t n) const KLEIDICV_STREAMING {
448 #if KLEIDICV_TARGET_SME2
449 5264 return svld1_x2(svwhilelt_c8(i, n, 2), p);
450 #else
451 10004 svbool_t pg1 = svwhilelt_b8(i, n);
452 10004 svbool_t pg2 = svwhilelt_b8(i + kStep_, n);
453 20008 return svcreate2(svld1(pg1, p), svld1_vnum(pg2, p, 1));
454 #endif
455 10004 }
456
457 392172 static svuint8x3_t load8x3_u8(const uint8_t *p) KLEIDICV_STREAMING {
458 #if KLEIDICV_TARGET_SME2
459 64196 svuint8x2_t sv2 = svld1_x2(svptrue_c8(), p);
460 192588 return svcreate3(svget2(sv2, 0), svget2(sv2, 1),
461 64196 svld1_vnum(svptrue_b8(), p, 2));
462 #else
463 655952 return svcreate3(svld1(svptrue_b8(), p), svld1_vnum(svptrue_b8(), p, 1),
464 327976 svld1_vnum(svptrue_b8(), p, 2));
465 #endif
466 64196 }
467
468 9488 svuint8x3_t load8x3_while_u8(const uint8_t *p, uint64_t i,
469 uint64_t n) const KLEIDICV_STREAMING {
470 #if KLEIDICV_TARGET_SME2
471 3050 svcount_t pgc = svwhilelt_c8(i, n, 2);
472 3050 svbool_t pgb = svwhilelt_b8(i + 2 * kStep_, n);
473 3050 svuint8x2_t sv2 = svld1_x2(pgc, p);
474 6100 return svcreate3(svget2(sv2, 0), svget2(sv2, 1), svld1_vnum(pgb, p, 2));
475 #else
476 6438 svbool_t pg1 = svwhilelt_b8(i, n);
477 6438 svbool_t pg2 = svwhilelt_b8(i + kStep_, n);
478 6438 svbool_t pg3 = svwhilelt_b8(i + 2 * kStep_, n);
479 19314 return svcreate3(svld1(pg1, p), svld1_vnum(pg2, p, 1),
480 6438 svld1_vnum(pg3, p, 2));
481 #endif
482 9488 }
483
484 407666 svuint8_t interpolate(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
485 svuint8_t a, svuint8_t b, svuint8_t c,
486 svuint8_t d) const KLEIDICV_STREAMING {
487 #if KLEIDICV_TARGET_SME2
488 68315 svuint16x2_t vsxfrac = svld1_x2(svptrue_c8(), pcit.frac_ptr_);
489 68315 svuint16_t vsxfrac_b = svget2(vsxfrac, 0);
490 68315 svuint16_t vsxfrac_t = svget2(vsxfrac, 1);
491 #else
492 339351 svuint16_t vsxfrac_b = svld1(svptrue_b16(), pcit.frac_ptr_);
493 339351 svuint16_t vsxfrac_t = svld1_vnum(svptrue_b16(), pcit.frac_ptr_, 1);
494 #endif
495 407666 svuint16_t half = svdup_n_u16(128);
496 815332 svuint8_t left = svaddhnb(
497 407666 svshll8b(a), svmla_n_u16_x(svptrue_b16(), half, svsublb(c, a), yfrac));
498 815332 svuint8_t right = svaddhnb(
499 407666 svshll8b(b), svmla_n_u16_x(svptrue_b16(), half, svsublb(d, b), yfrac));
500 815332 left = svaddhnt(left, svshll8t(a),
501 407666 svmla_n_u16_x(svptrue_b16(), half, svsublt(c, a), yfrac));
502 815332 right = svaddhnt(right, svshll8t(b),
503 407666 svmla_n_u16_x(svptrue_b16(), half, svsublt(d, b), yfrac));
504
505 815332 svuint8_t res =
506 815332 svaddhnb(svshll8b(left),
507 407666 svmla_x(svptrue_b16(), half, svsublb(right, left), vsxfrac_b));
508 815332 return svaddhnt(
509 407666 res, svshll8t(left),
510 407666 svmla_x(svptrue_b16(), half, svsublt(right, left), vsxfrac_t));
511 407666 }
512
513 206836 svuint8_t common_vector_path_r2(
514 const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, svuint8x2_t topsrc,
515 svuint8x2_t bottomsrc) const KLEIDICV_STREAMING {
516 206836 svuint8_t vsx0_idx = svld1(svptrue_b8(), pcit.idx_ptr_);
517 206836 svuint8_t vsx1_idx = svadd_n_u8_x(svptrue_b8(), vsx0_idx, kChannels);
518 206836 svuint8_t a = svtbl2_u8(topsrc, vsx0_idx);
519 206836 svuint8_t b = svtbl2_u8(topsrc, vsx1_idx);
520 206836 svuint8_t c = svtbl2_u8(bottomsrc, vsx0_idx);
521 206836 svuint8_t d = svtbl2_u8(bottomsrc, vsx1_idx);
522 413672 return interpolate(pcit, yfrac, a, b, c, d);
523 206836 }
524
525 199202 svuint8_t vector_path_r2(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
526 const uint8_t *src_top,
527 const uint8_t *src_bottom) const KLEIDICV_STREAMING {
528 // Load 2*step elements, that's enough for 1/2 < scale < 1.0
529 199202 uint64_t src_index = *pcit.src_index_ptr_;
530 199202 svuint8x2_t topsrc = load8x2_u8(&src_top[src_index]);
531 199202 svuint8x2_t bottomsrc = load8x2_u8(&src_bottom[src_index]);
532 398404 return common_vector_path_r2(pcit, yfrac, topsrc, bottomsrc);
533 199202 }
534
535 7634 svuint8_t remaining_path_r2(const PrecalcIterator<kRatio> &pcit,
536 uint16_t yfrac, const uint8_t *src_top,
537 const uint8_t *src_bottom) const
538 KLEIDICV_STREAMING {
539 // Load 2*step elements, that's enough for 1/2 < scale < 1.0
540 7634 uint64_t src_index = *pcit.src_index_ptr_;
541 15268 svuint8x2_t topsrc = load8x2_while_u8(&src_top[src_index], src_index,
542 7634 src_width_ * kChannels);
543 15268 svuint8x2_t bottomsrc = load8x2_while_u8(&src_bottom[src_index], src_index,
544 7634 src_width_ * kChannels);
545 15268 return common_vector_path_r2(pcit, yfrac, topsrc, bottomsrc);
546 7634 }
547
548 200830 svuint8_t common_vector_path_r3(
549 const PrecalcIterator<kRatio> &pcit, uint16_t yfrac, svuint8x3_t topsrc,
550 svuint8x3_t bottomsrc) const KLEIDICV_STREAMING {
551 200830 svuint8_t vsx0_idx = svld1(svptrue_b8(), pcit.idx_ptr_);
552 200830 svuint8_t vsx1_idx = svadd_n_u8_x(svptrue_b8(), vsx0_idx, kChannels);
553 401660 svuint8_t a =
554 200830 svtbl2_u8(svcreate2(svget3(topsrc, 0), svget3(topsrc, 1)), vsx0_idx);
555 401660 svuint8_t b =
556 200830 svtbl2_u8(svcreate2(svget3(topsrc, 0), svget3(topsrc, 1)), vsx1_idx);
557 602490 svuint8_t c = svtbl2_u8(
558 401660 svcreate2(svget3(bottomsrc, 0), svget3(bottomsrc, 1)), vsx0_idx);
559 602490 svuint8_t d = svtbl2_u8(
560 401660 svcreate2(svget3(bottomsrc, 0), svget3(bottomsrc, 1)), vsx1_idx);
561
562 200830 vsx0_idx =
563 200830 svsub_n_u8_x(svptrue_b8(), vsx0_idx, static_cast<uint8_t>(2 * kStep_));
564 200830 vsx1_idx =
565 200830 svsub_n_u8_x(svptrue_b8(), vsx1_idx, static_cast<uint8_t>(2 * kStep_));
566 200830 a = svtbx_u8(a, svget3(topsrc, 2), vsx0_idx);
567 200830 b = svtbx_u8(b, svget3(topsrc, 2), vsx1_idx);
568 200830 c = svtbx_u8(c, svget3(bottomsrc, 2), vsx0_idx);
569 200830 d = svtbx_u8(d, svget3(bottomsrc, 2), vsx1_idx);
570 401660 return interpolate(pcit, yfrac, a, b, c, d);
571 200830 }
572
573 196086 svuint8_t vector_path_r3(const PrecalcIterator<kRatio> &pcit, uint16_t yfrac,
574 const uint8_t *src_top,
575 const uint8_t *src_bottom) const KLEIDICV_STREAMING {
576 // Load 3*step elements, that's enough for 1/3 < scale < 1.0
577 196086 uint64_t src_index = *pcit.src_index_ptr_;
578 196086 svuint8x3_t topsrc = load8x3_u8(&src_top[src_index]);
579 196086 svuint8x3_t bottomsrc = load8x3_u8(&src_bottom[src_index]);
580 392172 return common_vector_path_r3(pcit, yfrac, topsrc, bottomsrc);
581 196086 }
582
583 4744 svuint8_t remaining_path_r3(const PrecalcIterator<kRatio> &pcit,
584 uint16_t yfrac, const uint8_t *src_top,
585 const uint8_t *src_bottom) const
586 KLEIDICV_STREAMING {
587 // Load 3*step elements, that's enough for 1/3 < scale < 1.0
588 4744 uint64_t src_index = *pcit.src_index_ptr_;
589 9488 svuint8x3_t topsrc = load8x3_while_u8(&src_top[src_index], src_index,
590 4744 src_width_ * kChannels);
591 9488 svuint8x3_t bottomsrc = load8x3_while_u8(&src_bottom[src_index], src_index,
592 4744 src_width_ * kChannels);
593 9488 return common_vector_path_r3(pcit, yfrac, topsrc, bottomsrc);
594 4744 }
595
596 4828 void process_row(uint64_t dy) const KLEIDICV_STREAMING {
597 4828 uint64_t sy_fixp = to_src_y(dy);
598 4828 ptrdiff_t sy = static_cast<ptrdiff_t>(sy_fixp >> kFixpBits);
599 4828 const uint8_t *src_top = &src_rows_.at(sy)[0];
600 4828 const uint8_t *src_bottom = &src_rows_.at(sy + 1)[0];
601 4828 uint8_t *dst = &dst_rows_.at(static_cast<ptrdiff_t>(dy))[0];
602 4828 uint8_t *dst_end = dst + dst_width_ * kChannels;
603 // Get the highest 8 bits of the fractional part
604 // This is a good compromise between accuracy and performance
605 // Because the result is 8bits, the error only affects the least
606 // significant 1-2 bits, see the accuracy calculation in kleidicv.h
607 9656 uint16_t yfrac =
608 4828 static_cast<uint16_t>((sy_fixp - (sy << kFixpBits)) >> (kFixpBits - 8));
609 4828 auto pcit = precalc_.begin();
610
12/12
✓ Branch 0 taken 15815 times.
✓ Branch 1 taken 672 times.
✓ Branch 2 taken 33519 times.
✓ Branch 3 taken 1032 times.
✓ Branch 4 taken 50267 times.
✓ Branch 5 taken 964 times.
✓ Branch 6 taken 15414 times.
✓ Branch 7 taken 498 times.
✓ Branch 8 taken 32692 times.
✓ Branch 9 taken 858 times.
✓ Branch 10 taken 49937 times.
✓ Branch 11 taken 804 times.
202472 while (pcit.index_ + 1 < precalc_.n_iterations_2x()) {
611 197644 svuint8_t res0, res1;
612 if constexpr (kRatio == 3) {
613 98043 res0 = vector_path_r3(pcit, yfrac, src_top, src_bottom);
614 98043 ++pcit;
615 98043 res1 = vector_path_r3(pcit, yfrac, src_top, src_bottom);
616 98043 ++pcit;
617 } else if constexpr (kRatio == 2) {
618 99601 res0 = vector_path_r2(pcit, yfrac, src_top, src_bottom);
619 99601 ++pcit;
620 99601 res1 = vector_path_r2(pcit, yfrac, src_top, src_bottom);
621 99601 ++pcit;
622 }
623 #if KLEIDICV_TARGET_SME2
624 32079 svst1(svptrue_c8(), dst, svcreate2(res0, res1));
625 #else
626 165565 svst1(svptrue_b8(), dst, res0);
627 165565 svst1_vnum(svptrue_b8(), dst, 1, res1);
628 #endif // KLEIDICV_TARGET_SME2
629 197644 dst += 2 * kStep_;
630 197644 }
631
632 // similar to above, but only a single vector path and with predicates
633
12/12
✓ Branch 0 taken 1895 times.
✓ Branch 1 taken 672 times.
✓ Branch 2 taken 2635 times.
✓ Branch 3 taken 1032 times.
✓ Branch 4 taken 3104 times.
✓ Branch 5 taken 964 times.
✓ Branch 6 taken 968 times.
✓ Branch 7 taken 498 times.
✓ Branch 8 taken 2062 times.
✓ Branch 9 taken 858 times.
✓ Branch 10 taken 1714 times.
✓ Branch 11 taken 804 times.
17206 while (pcit.index_ < precalc_.n_iterations()) {
634 12378 svbool_t pgdst = svwhilelt_b8(0L, dst_end - dst);
635 12378 svuint8_t res;
636 if constexpr (kRatio == 2) {
637 7634 res = remaining_path_r2(pcit, yfrac, src_top, src_bottom);
638 } else if constexpr (kRatio == 3) {
639 4744 res = remaining_path_r3(pcit, yfrac, src_top, src_bottom);
640 }
641 12378 svst1(pgdst, dst, res);
642 12378 ++pcit;
643 12378 dst += kStep_;
644 12378 }
645 4828 }
646
647 const Rows<const uint8_t> src_rows_;
648 const Rows<uint8_t> dst_rows_;
649 const size_t src_width_;
650 const size_t src_height_;
651 const size_t y_begin_;
652 const size_t y_end_;
653 const size_t dst_width_;
654 const size_t dst_height_;
655 const ptrdiff_t kStep_;
656 PrecalcIndicesFractions<kRatio, kChannels> precalc_;
657 };
658
659 } // namespace resize_generic_u8
660
661 // ratio: number of vectors to load and resize to 1 vector
662 // - supported combinations of (ratio, channel):
663 // (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3)
664 template <ptrdiff_t kRatio, ptrdiff_t kChannels>
665 457 kleidicv_error_t kleidicv_resize_generic_stripe_u8_sc(
666 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
667 size_t y_begin, size_t y_end,
668 uint8_t *dst, // NOLINT
669 size_t dst_stride, size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
670 914 resize_generic_u8::ResizeGenericU8Operation<kRatio, kChannels> operation(
671 457 src, src_stride, src_width, src_height, y_begin, y_end, dst, dst_stride,
672 457 dst_width, dst_height);
673 457 return operation.process_rows();
674 457 }
675
676 #define KLEIDICV_INSTANTIATE_TEMPLATE_SC(ratio, channels) \
677 template kleidicv_error_t \
678 kleidicv_resize_generic_stripe_u8_sc<ratio, channels>( \
679 const uint8_t *src, size_t src_stride, size_t src_width, \
680 size_t src_height, size_t y_begin, size_t y_end, uint8_t *dst, \
681 size_t dst_stride, size_t dst_width, size_t dst_height) \
682 KLEIDICV_STREAMING
683
684 KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 1L);
685 KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 2L);
686 KLEIDICV_INSTANTIATE_TEMPLATE_SC(2L, 3L);
687 KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 1L);
688 KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 2L);
689 KLEIDICV_INSTANTIATE_TEMPLATE_SC(3L, 3L);
690
691 } // namespace KLEIDICV_TARGET_NAMESPACE
692
693 #endif // KLEIDICV_RESIZE_LINEAR_GENERIC_SC_H
694