KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv422_to_rgb_sc.h
Date: 2026-01-20 20:58:59
           Exec  Total  Coverage
Lines:      240    240    100.0%
Functions:  152    218     69.7%
Branches:    97    103     94.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV422_TO_RGB_SC_H
6 #define KLEIDICV_YUV422_TO_RGB_SC_H
7
8 #include <utility>
9
10 #include "kleidicv/conversions/yuv_to_rgb.h"
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/sve2.h"
13 #include "yuv42x_coefficients.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16 template <size_t b_idx, size_t u_chroma_idx, size_t y_idx, size_t dcn,
17 bool use_unpack_path>
18 class YUV422ToRGBxOrBGRx {
19 public:
20 // Byte offsets for chroma samples inside a 4-byte YUV422 tuple (Y0 U Y1 V).
21 static constexpr size_t u_idx = u_chroma_idx;
22 static constexpr size_t v_idx = (u_idx + 2) % 4;
23 // Source channel count (scn = 2) because YUV422 averages two bytes
24 // per pixel: one luma sample (Y) plus one chroma sample (U or V)
25 // shared between the two pixels of each pair.
26 static constexpr size_t scn = 2;
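To make the index arithmetic concrete, here is a minimal scalar sketch (illustrative only, not library code) of how v_idx follows from u_idx for the three packed orderings:

// One 4-byte YUV422 group carries two pixels: two luma samples plus a
// shared U/V pair, hence scn = 2 bytes per pixel on average.
#include <cstddef>

constexpr size_t v_index(size_t u_idx) { return (u_idx + 2) % 4; }

static_assert(v_index(1) == 3);  // YUYV: [Y0 U  Y1 V ]
static_assert(v_index(0) == 2);  // UYVY: [U  Y0 V  Y1]
static_assert(v_index(3) == 1);  // YVYU: [Y0 V  Y1 U ]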
27
28 2080 static kleidicv_error_t yuv2rgbx_operation(const uint8_t* src,
29 size_t src_stride, uint8_t* dst,
30 size_t dst_stride, size_t width,
31 size_t height) KLEIDICV_STREAMING {
32 // Keep track of the current output row being written.
33 2080 Rows<uint8_t> dst_rows{dst, dst_stride, dcn};
34 2080 auto kVectorLength = svcntb();
35
36 // Loop through rows along the image height.
37
44/48
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 2288 times.
✓ Branch 2 taken 159 times.
✓ Branch 3 taken 6888 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 24 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 24 times.
✓ Branch 12 taken 51 times.
✓ Branch 13 taken 2288 times.
✓ Branch 14 taken 156 times.
✓ Branch 15 taken 6888 times.
✓ Branch 16 taken 51 times.
✓ Branch 17 taken 2288 times.
✓ Branch 18 taken 156 times.
✓ Branch 19 taken 6888 times.
✓ Branch 20 taken 51 times.
✓ Branch 21 taken 2288 times.
✓ Branch 22 taken 156 times.
✓ Branch 23 taken 6888 times.
✓ Branch 24 taken 51 times.
✓ Branch 25 taken 2288 times.
✓ Branch 26 taken 156 times.
✓ Branch 27 taken 6888 times.
✓ Branch 28 taken 51 times.
✓ Branch 29 taken 2288 times.
✓ Branch 30 taken 156 times.
✓ Branch 31 taken 6888 times.
✓ Branch 32 taken 51 times.
✓ Branch 33 taken 2288 times.
✓ Branch 34 taken 156 times.
✓ Branch 35 taken 6888 times.
✓ Branch 36 taken 51 times.
✓ Branch 37 taken 2288 times.
✓ Branch 38 taken 156 times.
✓ Branch 39 taken 6888 times.
✓ Branch 40 taken 51 times.
✓ Branch 41 taken 2288 times.
✓ Branch 42 taken 156 times.
✓ Branch 43 taken 6888 times.
✓ Branch 44 taken 51 times.
✓ Branch 45 taken 2288 times.
✓ Branch 46 taken 156 times.
✓ Branch 47 taken 6888 times.
93888 for (size_t y = 0; y < height; y++, src += src_stride) {
38 91808 LoopUnroll2 loop{width, kVectorLength};
39
40 // Use loop.unroll_twice to emit two output vectors per iteration.
41 // In YUV422, pixel pairs are interleaved as (Y0, U0, Y1, V0), and
42 // each 4-byte group yields two RGBx output pixels. A single svld4
43 // therefore provides the data for two destination vectors, which the
44 // loop body converts and stores together for better efficiency.
45 struct UnrollTwiceFunctor {
46 const uint8_t* src_row;
47 Rows<uint8_t>& dst_rows;
48 size_t kVectorLength;
49
50 576 KLEIDICV_FORCE_INLINE void operator()(size_t index) const
51 KLEIDICV_STREAMING {
52 576 svbool_t pg = svptrue_b8();
53
54 // Deinterleave the YUV422 data into separate channels.
55 // svld4() loads svcntb() groups of 4 bytes (Y0, U0, Y1, V0) — 16
56 // groups for a 128-bit vector. Each group holds two pixels, and each
57 // pixel contributes two components (Y + chroma), so the structure
58 // load yields 4 vectors — Y0, Y1, U, and V — exactly matching the
59 // twice-unrolled iteration (a scalar sketch follows the functor).
60 576 svuint8x4_t yuv422 = svld4(pg, src_row + index * scn);
61 576 svuint8_t y_even_lanes = svget4(yuv422, y_idx);
62 576 svuint8_t y_odd_lanes = svget4(yuv422, y_idx + scn);
63 576 svuint8_t u = svget4(yuv422, u_idx);
64 576 svuint8_t v = svget4(yuv422, v_idx);
65
66 // Convert two output vectors in one go (loop unrolled twice).
67 // The second destination pointer is advanced by kVectorLength * dcn:
68 // - kVectorLength: number of pixels produced per vector
69 // - dcn: destination channels per pixel (3 for RGB, 4 for RGBA)
70 // Because we emit two RGBx vectors per iteration, the second write
71 // starts exactly kVectorLength * dcn bytes after the first.
72 576 yuv422_to_rgb(
73 pg, y_even_lanes, y_odd_lanes, u, v,
74 576 dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)),
75 1152 dst_rows.as_columns().ptr_at(
76 576 static_cast<ptrdiff_t>(index + kVectorLength)),
77 pg, pg);
78 576 }
79 };
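For reference, a scalar model of the deinterleave that svld4 performs here (a sketch assuming the YUYV ordering, i.e. y_idx = 0 and u_idx = 1; deinterleave_yuyv is a hypothetical helper, not part of KleidiCV):

#include <cstddef>
#include <cstdint>

// Each 4-byte group feeds one lane of each of the four result vectors.
void deinterleave_yuyv(const uint8_t* src, size_t groups,
                       uint8_t* y_even, uint8_t* y_odd,
                       uint8_t* u, uint8_t* v) {
  for (size_t i = 0; i < groups; ++i) {
    y_even[i] = src[4 * i + 0];  // luma of the even-positioned pixel
    u[i]      = src[4 * i + 1];  // chroma shared by the pixel pair
    y_odd[i]  = src[4 * i + 2];  // luma of the odd-positioned pixel
    v[i]      = src[4 * i + 3];  // chroma shared by the pixel pair
  }
}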
80
81 91808 loop.unroll_twice(UnrollTwiceFunctor{src, dst_rows, kVectorLength});
82
83 struct RemainingFunctor {
84 const uint8_t* src_row;
85 Rows<uint8_t>& dst_rows;
86 size_t kVectorLength;
87
88 91808 KLEIDICV_FORCE_INLINE void operator()(size_t index, size_t length) const
89 KLEIDICV_STREAMING {
90 91808 svbool_t pg = svwhilelt_b8_u64(index, length);
91 91808 svbool_t pg_st1 = svwhilelt_b8_u64(index, length);
92 91808 svbool_t pg_st2 = svwhilelt_b8_u64(index + kVectorLength, length);
93
94 183616 svuint8x4_t yuv422 = svld4(svwhilelt_b8_u64(0, (length - index) / 2),
95 91808 src_row + index * scn);
96
97 91808 svuint8_t y_even_lanes = svget4(yuv422, y_idx);
98 91808 svuint8_t y_odd_lanes = svget4(yuv422, y_idx + scn);
99 91808 svuint8_t u = svget4(yuv422, u_idx);
100 91808 svuint8_t v = svget4(yuv422, v_idx);
101
102 // Convert two output vectors in one go (loop unrolled twice).
103 // The second destination pointer is advanced by kVectorLength * dcn:
104 // - kVectorLength: number of pixels produced per vector
105 // - dcn: destination channels per pixel (3 for RGB, 4 for RGBA)
106 // Because we emit two RGBx vectors per iteration, the second write
107 // starts exactly kVectorLength * dcn bytes after the first.
108 91808 yuv422_to_rgb(
109 pg, y_even_lanes, y_odd_lanes, u, v,
110 91808 dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)),
111 183616 dst_rows.as_columns().ptr_at(
112 91808 static_cast<ptrdiff_t>(index + kVectorLength)),
113 pg_st1, pg_st2);
114 91808 }
115 };
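The tail predicates above can be modelled with plain arithmetic (a sketch; tail_lanes is a hypothetical helper, and it assumes index and length are pixel counts with at most 2 * vl pixels remaining):

#include <algorithm>
#include <cstddef>

struct TailLanes {
  size_t load_groups;    // active lanes of the svld4 predicate
  size_t store1_pixels;  // active lanes of pg / pg_st1
  size_t store2_pixels;  // active lanes of pg_st2
};

TailLanes tail_lanes(size_t index, size_t length, size_t vl) {
  size_t remaining = length - index;  // pixels still to produce
  return TailLanes{
      remaining / 2,                        // one 4-byte group = 2 pixels
      std::min(remaining, vl),              // first output vector
      remaining > vl ? remaining - vl : 0,  // second output vector
  };
}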
116
117 91808 loop.remaining(RemainingFunctor{src, dst_rows, kVectorLength});
118
119 91808 ++dst_rows;
120 91808 }
121 2080 return KLEIDICV_OK;
122 2080 }
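LoopUnroll2 itself is internal to KleidiCV; the sketch below only mirrors how unroll_twice and remaining are invoked above (an assumed model, with loop_unroll2_model as a hypothetical stand-in):

#include <cstddef>

template <typename Body, typename Tail>
void loop_unroll2_model(size_t width, size_t vl, Body body, Tail tail) {
  size_t index = 0;
  // Full chunks of two vectors (2 * vl pixels) per iteration.
  for (; index + 2 * vl <= width; index += 2 * vl) {
    body(index);  // corresponds to UnrollTwiceFunctor
  }
  // Predicated tail for the final partial chunk.
  if (index < width) {
    tail(index, width);  // corresponds to RemainingFunctor
  }
}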
123
124 private:
125 // Convert one deinterleaved block of YUV422 data into two vectors of
126 // RGBx pixels: even- and odd-pixel luma vectors plus the shared U and
127 // V chroma vectors. The function computes the R, G, B channels,
128 // normalizes them, and stores results as RGB (3) or RGBA (4 channels).
129 KLEIDICV_FORCE_INLINE
130 92384 static void yuv422_to_rgb(svbool_t& pg, const svuint8_t& y_even_lanes,
131 const svuint8_t& y_odd_lanes, const svuint8_t& u,
132 const svuint8_t& v, uint8_t* rgbx0, uint8_t* rgbx1,
133 svbool_t& pg_st1,
134 svbool_t& pg_st2) KLEIDICV_STREAMING {
135 // --- Preprocess Y channel ---
136 // Subtract 16 from luma (Y') with saturation and widen later to 32 bits.
137 92384 svuint8_t y_even_lanes_m16 = svqsub(y_even_lanes, static_cast<uint8_t>(16));
138 92384 svuint8_t y_odd_lanes_m16 = svqsub(y_odd_lanes, static_cast<uint8_t>(16));
139
140 // Expand Y values into 32-bit lanes for later arithmetic.
141 // Note: "even" and "odd" refer to the pixel position in the YUV422 packing,
142 // not the Y component itself.
143 //
144 // In YUV422, pixels are stored as (Y0, U0, Y1, V0).
145 // - The "even" vectors collect Y0, Y2, Y4, ... → these generate the
146 // even-positioned RGB outputs.
147 // - The "odd" vectors collect Y1, Y3, Y5, ... → these generate the
148 // odd-positioned RGB outputs.
149 //
150 // How it works here:
151 //   1. Widen Y to 16-bit lanes split by even/odd byte position using
152 //      svmovlb()/svmovlt(). 2. Widen each half to 32-bit lanes with svunpklo()/svunpkhi().
153 //
154 // Why this may look unusual:
155 // - At first glance, you might expect the *_lo_hi vectors to be built
156 //   from *_hi alone, but that would require extra moves and shuffles.
157 // - The current scheme uses only 2× svmovl + 4× svunpk per group, which
158 //   is efficient since the pipeline can issue two svunpk in parallel.
159 // - A more “intuitive” pairing would need 4× svmovl + 2× svunpk, which
160 //   is slower because svmovl has less bundling freedom.
161 // - Using svmovlb/svmovlt also aligns lanes so later narrowing can run
162 // without additional shuffles, improving overall performance.
163 92384 svint32_t y_even_lo_lo{}, y_even_lo_hi{}, y_even_hi_lo{}, y_even_hi_hi{};
164 92384 svint32_t y_odd_lo_lo{}, y_odd_lo_hi{}, y_odd_hi_lo{}, y_odd_hi_hi{};
165 // Expand U and V into 32-bit lanes (shared chroma).
166 // In YUV422, each U and V value is shared by a pair of pixels:
167 // (Y_even, U, Y_odd, V)
168 // Therefore, the same U and V vectors are used when computing both
169 // the "even" and "odd" RGB outputs.
170 92384 svint32_t v_lo_lo{}, v_lo_hi{}, v_hi_lo{}, v_hi_hi{};
171 92384 svint32_t u_lo_lo{}, u_lo_hi{}, u_hi_lo{}, u_hi_hi{};
172 if constexpr (use_unpack_path) {
173 22880 svuint16_t y_even_lo = svmovlb(y_even_lanes_m16);
174 22880 svuint16_t y_even_hi = svmovlt(y_even_lanes_m16);
175 22880 svuint16_t y_odd_lo = svmovlb(y_odd_lanes_m16);
176 22880 svuint16_t y_odd_hi = svmovlt(y_odd_lanes_m16);
177 22880 y_even_lo_lo = svreinterpret_s32(svunpklo(y_even_lo));
178 22880 y_even_lo_hi = svreinterpret_s32(svunpklo(y_even_hi));
179 22880 y_even_hi_lo = svreinterpret_s32(svunpkhi(y_even_lo));
180 22880 y_even_hi_hi = svreinterpret_s32(svunpkhi(y_even_hi));
181 22880 y_odd_lo_lo = svreinterpret_s32(svunpklo(y_odd_lo));
182 22880 y_odd_lo_hi = svreinterpret_s32(svunpklo(y_odd_hi));
183 22880 y_odd_hi_lo = svreinterpret_s32(svunpkhi(y_odd_lo));
184 22880 y_odd_hi_hi = svreinterpret_s32(svunpkhi(y_odd_hi));
185
186 22880 svuint16_t v_lo = svmovlb(v);
187 22880 svuint16_t v_hi = svmovlt(v);
188 22880 svuint16_t u_lo = svmovlb(u);
189 22880 svuint16_t u_hi = svmovlt(u);
190 22880 v_lo_lo = svreinterpret_s32(svunpklo(v_lo));
191 22880 v_lo_hi = svreinterpret_s32(svunpklo(v_hi));
192 22880 v_hi_lo = svreinterpret_s32(svunpkhi(v_lo));
193 22880 v_hi_hi = svreinterpret_s32(svunpkhi(v_hi));
194 22880 u_lo_lo = svreinterpret_s32(svunpklo(u_lo));
195 22880 u_lo_hi = svreinterpret_s32(svunpklo(u_hi));
196 22880 u_hi_lo = svreinterpret_s32(svunpkhi(u_lo));
197 22880 u_hi_hi = svreinterpret_s32(svunpkhi(u_hi));
198 22880 } else {
199 139008 svuint8_t index0 = svreinterpret_u8_u32(
200 69504 svindex_u32(0xFFFFFF00, 0x0002)); // 0, 2, 4, 6, ...
201 139008 svuint8_t index1 = svreinterpret_u8_u32(
202 69504 svindex_u32(0xFFFFFF00 + svcnth(), 0x0002)); // 8, 10, 12, 14, ...
203 139008 svuint8_t index2 = svreinterpret_u8_u32(
204 69504 svindex_u32(0xFFFFFF01, 0x0002)); // 1, 3, 5, 7, ...
205 139008 svuint8_t index3 = svreinterpret_u8_u32(
206 69504 svindex_u32(0xFFFFFF01 + svcnth(), 0x0002)); // 9, 11, 13, 15, ...
207
208 69504 y_even_lo_lo = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index0));
209 69504 y_even_lo_hi = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index2));
210 69504 y_even_hi_lo = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index1));
211 69504 y_even_hi_hi = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index3));
212 69504 y_odd_lo_lo = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index0));
213 69504 y_odd_lo_hi = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index2));
214 69504 y_odd_hi_lo = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index1));
215 69504 y_odd_hi_hi = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index3));
216
217 69504 v_lo_lo = svreinterpret_s32(svtbl_u8(v, index0));
218 69504 v_lo_hi = svreinterpret_s32(svtbl_u8(v, index2));
219 69504 v_hi_lo = svreinterpret_s32(svtbl_u8(v, index1));
220 69504 v_hi_hi = svreinterpret_s32(svtbl_u8(v, index3));
221 69504 u_lo_lo = svreinterpret_s32(svtbl_u8(u, index0));
222 69504 u_lo_hi = svreinterpret_s32(svtbl_u8(u, index2));
223 69504 u_hi_lo = svreinterpret_s32(svtbl_u8(u, index1));
224 69504 u_hi_hi = svreinterpret_s32(svtbl_u8(u, index3));
225 69504 }
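Both branches produce the same lane mapping; for a 128-bit vector (bytes b0..b15) it works out as below, and the hypothetical src_byte helper encodes the pattern for any vector length (vl_bytes = svcntb()):

// *_lo_lo: b0, b2,  b4,  b6   = svunpklo(svmovlb(...)) = svtbl via index0
// *_lo_hi: b1, b3,  b5,  b7   = svunpklo(svmovlt(...)) = svtbl via index2
// *_hi_lo: b8, b10, b12, b14  = svunpkhi(svmovlb(...)) = svtbl via index1
// *_hi_hi: b9, b11, b13, b15  = svunpkhi(svmovlt(...)) = svtbl via index3
// The svtbl path zero-extends in one step: each 32-bit index selects one
// real byte (its low byte) plus three 0xFF out-of-range bytes, which
// svtbl turns into zeros.
#include <cstddef>

constexpr size_t src_byte(size_t group, size_t lane, size_t vl_bytes) {
  // group: 0 = lo_lo, 1 = lo_hi, 2 = hi_lo, 3 = hi_hi
  return (group & 1) + (group & 2 ? vl_bytes / 2 : 0) + 2 * lane;
}
static_assert(src_byte(0, 3, 16) == 6);  // lo_lo lane 3 <- b6
static_assert(src_byte(3, 0, 16) == 9);  // hi_hi lane 0 <- b9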
226
227 // Scale the Y (luma) values by the fixed coefficient kYWeight.
228 // This produces the weighted luma contribution (Y') that forms the
229 // base term for all R, G, and B channel calculations in the
230 // YUV → RGB conversion.
231 92384 y_even_lo_lo = svmul_x(pg, y_even_lo_lo, kYWeight);
232 92384 y_even_lo_hi = svmul_x(pg, y_even_lo_hi, kYWeight);
233 92384 y_even_hi_lo = svmul_x(pg, y_even_hi_lo, kYWeight);
234 92384 y_even_hi_hi = svmul_x(pg, y_even_hi_hi, kYWeight);
235 92384 y_odd_lo_lo = svmul_x(pg, y_odd_lo_lo, kYWeight);
236 92384 y_odd_lo_hi = svmul_x(pg, y_odd_lo_hi, kYWeight);
237 92384 y_odd_hi_lo = svmul_x(pg, y_odd_hi_lo, kYWeight);
238 92384 y_odd_hi_hi = svmul_x(pg, y_odd_hi_hi, kYWeight);
239
240 // Precompute constant base offsets for R, G, and B channels.
241 // These include the rounding term (1 << (kWeightScale - 1)) and the
242 // bias correction for centering U and V around 128.
243 // Folding the weighted -128 centering into these constants leaves a
244 // single multiply-accumulate per chroma term in the YUV → RGB formulas.
245 92384 constexpr int32_t kOffset = 1 << (kWeightScale - 1);
246 92384 svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]);
247 184768 svint32_t g_base = svdup_s32(
248 92384 kOffset - 128 * (kUVWeights[kGUWeightIndex] + kUVWeights[kGVWeightIndex]));
249 92384 svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[kBUWeightIndex]);
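Expanding the algebra behind these bases (a scalar sketch; the coefficient and scale values shown are illustrative BT.601-style stand-ins, while the real ones come from yuv42x_coefficients.h):

#include <cstdint>

constexpr int32_t kScaleAssumed = 20;                // stand-in for kWeightScale
constexpr int32_t kRound = 1 << (kScaleAssumed - 1); // rounding term
constexpr int32_t kRVAssumed = 1673527;              // ~1.596 * 2^20, illustrative

// Instead of base + weight * (v - 128) per lane, fold the -128 centering
// into a constant so the hot loop is a single multiply-accumulate:
constexpr int32_t r_base_model = kRound - 128 * kRVAssumed;

inline int32_t red_model(int32_t y_weighted, int32_t v) {
  return (y_weighted + r_base_model + kRVAssumed * v) >> kScaleAssumed;
}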
250
251 // --- Compute the Red channel ---
252 // Formula: R = Y + (kRV * V) + bias
253 // - Start with r_base (rounding + bias correction for V centered at 128).
254 // - Multiply V by kUVWeights[kRVWeightIndex] and add the result to r_base.
255 // - Reuse the same V contribution for both even and odd pixels, since
256 // chroma is shared in YUV422.
257 // - Finally, add the weighted Y values (even and odd) to produce
258 // the full R channel before normalization and packing to 8 bits.
259 184768 svint32_t r_even_lo_lo =
260 92384 svmla_x(pg, r_base, v_lo_lo, kUVWeights[kRVWeightIndex]);
261 184768 svint32_t r_even_lo_hi =
262 92384 svmla_x(pg, r_base, v_lo_hi, kUVWeights[kRVWeightIndex]);
263 184768 svint32_t r_even_hi_lo =
264 92384 svmla_x(pg, r_base, v_hi_lo, kUVWeights[kRVWeightIndex]);
265 184768 svint32_t r_even_hi_hi =
266 92384 svmla_x(pg, r_base, v_hi_hi, kUVWeights[kRVWeightIndex]);
267
268 // Re-interleave and pack the Red channel to u8.
269 // R was computed in four 32-bit chunks split by pixel parity; for a
270 // 128-bit vector: r_even_lo_lo holds even pixels 0..3, r_even_lo_hi
271 // 4..7, r_even_hi_lo 8..11, and r_even_hi_hi 12..15.
272 // The same chroma sums drive the odd pixels, so we reuse the r_even_*
273 // vectors when adding the odd Y lanes below.
274 // The svaddhnb/svaddhnt + svsra + svqxtunb/svqxtunt sequence shifts
275 // the 32-bit sums right by kWeightScale in total (16 bits via the
276 // narrowing adds, the rest via svsra), saturates s16 → u8, and
277 // interleaves even/odd lanes into raster order: [R0, R1, R2, ...].
278 // r0 packs the first vector of R samples; r1 the next, from *_hi_*.
279 92384 svint16_t r0_even = svaddhnb(r_even_lo_lo, y_even_lo_lo);
280 92384 r0_even = svaddhnt(r0_even, r_even_lo_hi, y_even_lo_hi);
281 92384 r0_even = svsra(svdup_n_s16(0), r0_even, kWeightScale - 16);
282 92384 svint16_t r0_odd = svaddhnb(r_even_lo_lo, y_odd_lo_lo);
283 92384 r0_odd = svaddhnt(r0_odd, r_even_lo_hi, y_odd_lo_hi);
284 92384 r0_odd = svsra(svdup_n_s16(0), r0_odd, kWeightScale - 16);
285 92384 svuint8_t r0 = svqxtunt(svqxtunb(r0_even), r0_odd);
286
287 92384 svint16_t r1_even = svaddhnb(r_even_hi_lo, y_even_hi_lo);
288 92384 r1_even = svaddhnt(r1_even, r_even_hi_hi, y_even_hi_hi);
289 92384 r1_even = svsra(svdup_n_s16(0), r1_even, kWeightScale - 16);
290 92384 svint16_t r1_odd = svaddhnb(r_even_hi_lo, y_odd_hi_lo);
291 92384 r1_odd = svaddhnt(r1_odd, r_even_hi_hi, y_odd_hi_hi);
292 92384 r1_odd = svsra(svdup_n_s16(0), r1_odd, kWeightScale - 16);
293 92384 svuint8_t r1 = svqxtunt(svqxtunb(r1_even), r1_odd);
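A scalar equivalent of the two-stage shift and saturation above (a sketch assuming kWeightScale > 16, which the svsra shift amount requires; pack_one is a hypothetical helper):

#include <cstdint>

constexpr int kPackScaleAssumed = 20;  // stand-in for kWeightScale

uint8_t pack_one(int32_t chroma_sum, int32_t y_weighted) {
  // svaddhnb/svaddhnt: add, keep the top 16 bits of the 32-bit sum
  // (an implicit truncating shift right by 16).
  int16_t high = static_cast<int16_t>((chroma_sum + y_weighted) >> 16);
  // svsra into zero: shift right by the remaining scale.
  int16_t scaled = static_cast<int16_t>(high >> (kPackScaleAssumed - 16));
  // svqxtunb/svqxtunt: saturate the signed 16-bit value to unsigned 8-bit.
  if (scaled < 0) return 0;
  if (scaled > 255) return 255;
  return static_cast<uint8_t>(scaled);
}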
294
295 // --- Compute the Green channel ---
296 // Formula: G = Y + (kGU * U + kGV * V) + bias, reusing the shared chroma
297 // samples for both pixels in each YUV422 pair before normalize/pack
298 // interleaves them back into raster order.
299 184768 svint32_t g_even_lo_lo =
300 92384 svmla_x(pg, g_base, u_lo_lo, kUVWeights[kGUWeightIndex]);
301 184768 svint32_t g_even_lo_hi =
302 92384 svmla_x(pg, g_base, u_lo_hi, kUVWeights[kGUWeightIndex]);
303 184768 svint32_t g_even_hi_lo =
304 92384 svmla_x(pg, g_base, u_hi_lo, kUVWeights[kGUWeightIndex]);
305 184768 svint32_t g_even_hi_hi =
306 92384 svmla_x(pg, g_base, u_hi_hi, kUVWeights[kGUWeightIndex]);
307 92384 g_even_lo_lo =
308 92384 svmla_x(pg, g_even_lo_lo, v_lo_lo, kUVWeights[kGVWeightIndex]);
309 92384 g_even_lo_hi =
310 92384 svmla_x(pg, g_even_lo_hi, v_lo_hi, kUVWeights[kGVWeightIndex]);
311 92384 g_even_hi_lo =
312 92384 svmla_x(pg, g_even_hi_lo, v_hi_lo, kUVWeights[kGVWeightIndex]);
313 92384 g_even_hi_hi =
314 92384 svmla_x(pg, g_even_hi_hi, v_hi_hi, kUVWeights[kGVWeightIndex]);
315
316 92384 svint16_t g0_even = svaddhnb(g_even_lo_lo, y_even_lo_lo);
317 92384 g0_even = svaddhnt(g0_even, g_even_lo_hi, y_even_lo_hi);
318 92384 g0_even = svsra(svdup_n_s16(0), g0_even, kWeightScale - 16);
319 // Same rationale as for Red: reuse the g_even_* chroma base when adding
320 // the odd Y lanes, avoiding redundant temporaries.
321 92384 svint16_t g0_odd = svaddhnb(g_even_lo_lo, y_odd_lo_lo);
322 92384 g0_odd = svaddhnt(g0_odd, g_even_lo_hi, y_odd_lo_hi);
323 92384 g0_odd = svsra(svdup_n_s16(0), g0_odd, kWeightScale - 16);
324 92384 svuint8_t g0 = svqxtunt(svqxtunb(g0_even), g0_odd);
325
326 92384 svint16_t g1_even = svaddhnb(g_even_hi_lo, y_even_hi_lo);
327 92384 g1_even = svaddhnt(g1_even, g_even_hi_hi, y_even_hi_hi);
328 92384 g1_even = svsra(svdup_n_s16(0), g1_even, kWeightScale - 16);
329 92384 svint16_t g1_odd = svaddhnb(g_even_hi_lo, y_odd_hi_lo);
330 92384 g1_odd = svaddhnt(g1_odd, g_even_hi_hi, y_odd_hi_hi);
331 92384 g1_odd = svsra(svdup_n_s16(0), g1_odd, kWeightScale - 16);
332 92384 svuint8_t g1 = svqxtunt(svqxtunb(g1_even), g1_odd);
333
334 // --- Compute the Blue channel ---
335 // Formula: B = Y + (kBU * U) + bias, sharing the U samples across each
336 // even/odd pair before normalize/pack interleaves the results.
337 184768 svint32_t b_even_lo_lo =
338 92384 svmla_x(pg, b_base, u_lo_lo, kUVWeights[kBUWeightIndex]);
339 184768 svint32_t b_even_lo_hi =
340 92384 svmla_x(pg, b_base, u_lo_hi, kUVWeights[kBUWeightIndex]);
341 184768 svint32_t b_even_hi_lo =
342 92384 svmla_x(pg, b_base, u_hi_lo, kUVWeights[kBUWeightIndex]);
343 184768 svint32_t b_even_hi_hi =
344 92384 svmla_x(pg, b_base, u_hi_hi, kUVWeights[kBUWeightIndex]);
345
346 92384 svint16_t b0_even = svaddhnb(b_even_lo_lo, y_even_lo_lo);
347 92384 b0_even = svaddhnt(b0_even, b_even_lo_hi, y_even_lo_hi);
348 92384 b0_even = svsra(svdup_n_s16(0), b0_even, kWeightScale - 16);
349 // Blue follows the same pattern, so reuse the b_even_* vectors for odd Y.
350 92384 svint16_t b0_odd = svaddhnb(b_even_lo_lo, y_odd_lo_lo);
351 92384 b0_odd = svaddhnt(b0_odd, b_even_lo_hi, y_odd_lo_hi);
352 92384 b0_odd = svsra(svdup_n_s16(0), b0_odd, kWeightScale - 16);
353 92384 svuint8_t b0 = svqxtunt(svqxtunb(b0_even), b0_odd);
354
355 92384 svint16_t b1_even = svaddhnb(b_even_hi_lo, y_even_hi_lo);
356 92384 b1_even = svaddhnt(b1_even, b_even_hi_hi, y_even_hi_hi);
357 92384 b1_even = svsra(svdup_n_s16(0), b1_even, kWeightScale - 16);
358 92384 svint16_t b1_odd = svaddhnb(b_even_hi_lo, y_odd_hi_lo);
359 92384 b1_odd = svaddhnt(b1_odd, b_even_hi_hi, y_odd_hi_hi);
360 92384 b1_odd = svsra(svdup_n_s16(0), b1_odd, kWeightScale - 16);
361 92384 svuint8_t b1 = svqxtunt(svqxtunb(b1_even), b1_odd);
362
363 if constexpr (dcn > 3) {
364 110688 svuint8x4_t rgba0 =
365 55344 svcreate4(b_idx ? r0 : b0, g0, b_idx ? b0 : r0, svdup_n_u8(0xFF));
366 110688 svuint8x4_t rgba1 =
367 55344 svcreate4(b_idx ? r1 : b1, g1, b_idx ? b1 : r1, svdup_n_u8(0xFF));
368 // Store RGBA pixels to memory.
369 55344 svst4_u8(pg_st1, rgbx0, rgba0);
370 55344 svst4_u8(pg_st2, rgbx1, rgba1);
371 55344 } else {
372 37040 svuint8x3_t rgb0 = svcreate3(b_idx ? r0 : b0, g0, b_idx ? b0 : r0);
373 37040 svuint8x3_t rgb1 = svcreate3(b_idx ? r1 : b1, g1, b_idx ? b1 : r1);
374 // Store RGB pixels to memory.
375 37040 svst3(pg_st1, rgbx0, rgb0);
376 37040 svst3(pg_st2, rgbx1, rgb1);
377 37040 }
378 92384 }
379 };
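The b_idx ternaries in the store step above implement the BGR/RGB swap; a minimal sketch of the mapping (blue_channel/red_channel are illustrative helpers, not library code):

#include <cstddef>

// b_idx is the byte position of blue in the output pixel.
constexpr size_t blue_channel(size_t b_idx) { return b_idx; }
constexpr size_t red_channel(size_t b_idx) { return 2 - b_idx; }

static_assert(blue_channel(0) == 0 && red_channel(0) == 2);  // BGR / BGRA
static_assert(blue_channel(2) == 2 && red_channel(2) == 0);  // RGB / RGBA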
380
381 template <size_t b_idx, size_t u_chroma_idx, size_t y_idx, size_t dcn>
382 2080 kleidicv_error_t dispatch_yuv422_to_rgb(bool use_unpack_path,
383 const uint8_t* src, size_t src_stride,
384 uint8_t* dst, size_t dst_stride,
385 size_t width, size_t height) {
386
22/24
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 159 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 51 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 51 times.
✓ Branch 9 taken 156 times.
✓ Branch 10 taken 51 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 51 times.
✓ Branch 13 taken 156 times.
✓ Branch 14 taken 51 times.
✓ Branch 15 taken 156 times.
✓ Branch 16 taken 51 times.
✓ Branch 17 taken 156 times.
✓ Branch 18 taken 51 times.
✓ Branch 19 taken 156 times.
✓ Branch 20 taken 51 times.
✓ Branch 21 taken 156 times.
✓ Branch 22 taken 51 times.
✓ Branch 23 taken 156 times.
2080 if (use_unpack_path) {
387 511 return YUV422ToRGBxOrBGRx<b_idx, u_chroma_idx, y_idx, dcn,
388 511 true>::yuv2rgbx_operation(src, src_stride, dst,
389 511 dst_stride, width,
390 511 height);
391 }
392 1569 return YUV422ToRGBxOrBGRx<b_idx, u_chroma_idx, y_idx, dcn,
393 1569 false>::yuv2rgbx_operation(src, src_stride, dst,
394 1569 dst_stride, width,
395 1569 height);
396 2080 }
397
398 KLEIDICV_TARGET_FN_ATTRS
399 2220 static kleidicv_error_t yuv422_to_rgb_u8_sc(
400 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
401 size_t width, size_t height,
402 kleidicv_color_conversion_t color_format) KLEIDICV_STREAMING {
403
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2217 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 2217 times.
2220 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
404
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2214 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 2214 times.
2217 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
405
6/6
✓ Branch 0 taken 11 times.
✓ Branch 1 taken 2203 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 2196 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 2196 times.
2214 CHECK_IMAGE_SIZE(width, height);
406
407 // YUV422 packs pixels in pairs: (Y0, U, Y1, V).
408 // Therefore, the image width must be at least 2 and always even.
409
4/4
✓ Branch 0 taken 2189 times.
✓ Branch 1 taken 7 times.
✓ Branch 2 taken 39 times.
✓ Branch 3 taken 2150 times.
2196 if (width < 2 || (width % 2) != 0) {
410 46 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
411 }
412 2150 bool use_unpack_path = KLEIDICV_UNLIKELY(svcntb() >= 256);
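// svcntb() returns the vector length in bytes (16 for 128-bit SVE, up to
// 256 at the architectural maximum of 2048 bits), so the unpack path is
// selected only for 2048-bit vectors; all shorter vector lengths take the
// svtbl path.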
413
13/13
✓ Branch 0 taken 207 times.
✓ Branch 1 taken 207 times.
✓ Branch 2 taken 70 times.
✓ Branch 3 taken 211 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 207 times.
✓ Branch 7 taken 207 times.
✓ Branch 8 taken 207 times.
✓ Branch 9 taken 207 times.
✓ Branch 10 taken 207 times.
✓ Branch 11 taken 207 times.
✓ Branch 12 taken 207 times.
2150 switch (color_format) {
414 case KLEIDICV_YUYV_TO_BGR:
415 211 return dispatch_yuv422_to_rgb<0, 1, 0, 3>(
416 211 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
417 break;
418 case KLEIDICV_UYVY_TO_BGR:
419 3 return dispatch_yuv422_to_rgb<0, 0, 1, 3>(
420 3 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
421 break;
422 case KLEIDICV_YVYU_TO_BGR:
423 3 return dispatch_yuv422_to_rgb<0, 3, 0, 3>(
424 3 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
425 break;
426 case KLEIDICV_YUYV_TO_RGB:
427 207 return dispatch_yuv422_to_rgb<2, 1, 0, 3>(
428 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
429 break;
430 case KLEIDICV_UYVY_TO_RGB:
431 207 return dispatch_yuv422_to_rgb<2, 0, 1, 3>(
432 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
433 break;
434 case KLEIDICV_YVYU_TO_RGB:
435 207 return dispatch_yuv422_to_rgb<2, 3, 0, 3>(
436 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
437 break;
438 case KLEIDICV_YUYV_TO_BGRA:
439 207 return dispatch_yuv422_to_rgb<0, 1, 0, 4>(
440 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
441 break;
442 case KLEIDICV_UYVY_TO_BGRA:
443 207 return dispatch_yuv422_to_rgb<0, 0, 1, 4>(
444 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
445 break;
446 case KLEIDICV_YVYU_TO_BGRA:
447 207 return dispatch_yuv422_to_rgb<0, 3, 0, 4>(
448 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
449 break;
450 case KLEIDICV_YUYV_TO_RGBA:
451 207 return dispatch_yuv422_to_rgb<2, 1, 0, 4>(
452 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
453 break;
454 case KLEIDICV_UYVY_TO_RGBA:
455 207 return dispatch_yuv422_to_rgb<2, 0, 1, 4>(
456 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
457 break;
458 case KLEIDICV_YVYU_TO_RGBA:
459 207 return dispatch_yuv422_to_rgb<2, 3, 0, 4>(
460 207 use_unpack_path, src, src_stride, dst, dst_stride, width, height);
461 break;
462 default:
463 70 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
464 break;
465 }
466 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
467 2220 }
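Read off the case labels above, the template arguments <b_idx, u_chroma_idx, y_idx, dcn> map to the packed byte layouts as follows (v_idx derived via (u_idx + 2) % 4):

// BGR variants use b_idx = 0, RGB variants b_idx = 2; three-channel
// outputs use dcn = 3, the *A outputs dcn = 4.
//   YUYV: [Y0 U  Y1 V ] -> y_idx = 0, u_idx = 1, v_idx = 3
//   UYVY: [U  Y0 V  Y1] -> y_idx = 1, u_idx = 0, v_idx = 2
//   YVYU: [Y0 V  Y1 U ] -> y_idx = 0, u_idx = 3, v_idx = 1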
468
469 } // namespace KLEIDICV_TARGET_NAMESPACE
470
471 #endif // KLEIDICV_YUV422_TO_RGB_SC_H
472