KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv422_to_rgb_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 240 240 100.0%
Functions: 73 73 100.0%
Branches: 103 103 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <utility>
6
7 #include "kleidicv/conversions/yuv_to_rgb.h"
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10 #include "yuv42x_coefficients.h"
11
12 namespace kleidicv::neon {
13 template <size_t b_idx, size_t u_chroma_idx, size_t y_idx, size_t dcn>
14 class YUV422ToRGBxOrBGRx {
15 public:
16 // Byte offsets for chroma samples inside a 4-byte YUV422 tuple (Y0 U Y1 V).
17 static constexpr size_t u_idx = u_chroma_idx;
18 static constexpr size_t v_idx = (u_idx + 2) % 4;
19 // Source channel count (scn = 2) because YUV422 is interleaved with
20 // two channels per pixel on average: one luma (Y) and one shared
21 // chroma (U or V).
22 static constexpr size_t scn = 2;
23
24 523 static kleidicv_error_t yuv2rgbx_operation(const uint8_t* src,
25 size_t src_stride, uint8_t* dst,
26 size_t dst_stride, size_t width,
27 size_t height) {
28 523 Rows<uint8_t> dst_rows{dst, dst_stride, dcn};
29
30
24/24
✓ Branch 0 taken 53 times.
✓ Branch 1 taken 2296 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 2296 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 2296 times.
✓ Branch 10 taken 52 times.
✓ Branch 11 taken 2296 times.
✓ Branch 12 taken 52 times.
✓ Branch 13 taken 2296 times.
✓ Branch 14 taken 52 times.
✓ Branch 15 taken 2296 times.
✓ Branch 16 taken 52 times.
✓ Branch 17 taken 2296 times.
✓ Branch 18 taken 52 times.
✓ Branch 19 taken 2296 times.
✓ Branch 20 taken 52 times.
✓ Branch 21 taken 2296 times.
✓ Branch 22 taken 52 times.
✓ Branch 23 taken 2296 times.
23499 for (size_t y = 0; y < height; y++, src += src_stride) {
31 22976 LoopUnroll2 loop{width, kVectorLength};
32
33 // Use loop.unroll_twice to process two pixels per iteration.
34 // In YUV422, two pixels are interleaved as (Y0, U0, Y1, V0).
35 // These four values produce two RGBx output pixels. By unrolling,
36 // we handle both pixels together in a single iteration, improving
37 // overall efficiency for that loop body.
38 struct UnrollTwiceFunctor {
39 const uint8_t* src_row;
40 Rows<uint8_t>& dst_rows;
41
42 384 KLEIDICV_FORCE_INLINE void operator()(size_t index) const {
43 // Deinterleave the YUV422 data into separate channels.
44 // vld4q_u8() loads 16 groups of 4 bytes: (Y0, U0, Y1, V0).
45 // Because we unroll twice, we must process two pixels at once.
46 // Each pixel contributes two components (Y + chroma), so 4 vectors
47 // are required: Y0, Y1, U, and V. This is why we perform 4 loads
48 // instead of 2 — they directly correspond to the unrolled iteration.
49 384 uint8x16x4_t yuv422 = vld4q_u8(src_row + index * scn);
50 384 uint8x16_t y_even_lanes = yuv422.val[y_idx];
51 384 uint8x16_t y_odd_lanes = yuv422.val[y_idx + scn];
52 384 uint8x16_t u = yuv422.val[u_idx];
53 384 uint8x16_t v = yuv422.val[v_idx];
54 // Convert two output vectors in one go (loop unrolled twice).
55 // The second destination pointer is advanced by kVectorLength * dcn:
56 // - kVectorLength: number of pixels produced per vector
57 // - dcn: destination channels per pixel (3 for RGB, 4 for RGBA)
58 // Because we emit two RGBx vectors per iteration, the second write
59 // starts exactly kVectorLength * dcn bytes after the first.
60 384 yuv422_to_rgb(
61 y_even_lanes, y_odd_lanes, u, v,
62 384 dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)),
63 768 dst_rows.as_columns().ptr_at(
64 384 static_cast<ptrdiff_t>(index + kVectorLength)));
65 384 }
66 };
67 22976 loop.unroll_twice(UnrollTwiceFunctor{src, dst_rows});
68
69 // Scalar loop over YUV422 pixels.
70 struct RemainingFunctor {
71 const uint8_t* src_row;
72 Rows<uint8_t>& dst_rows;
73
74 22976 KLEIDICV_FORCE_INLINE void operator()(size_t index,
75 size_t length) const {
76
24/24
✓ Branch 0 taken 3550 times.
✓ Branch 1 taken 2296 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 3550 times.
✓ Branch 7 taken 2296 times.
✓ Branch 8 taken 3550 times.
✓ Branch 9 taken 2296 times.
✓ Branch 10 taken 3550 times.
✓ Branch 11 taken 2296 times.
✓ Branch 12 taken 3550 times.
✓ Branch 13 taken 2296 times.
✓ Branch 14 taken 3550 times.
✓ Branch 15 taken 2296 times.
✓ Branch 16 taken 3550 times.
✓ Branch 17 taken 2296 times.
✓ Branch 18 taken 3550 times.
✓ Branch 19 taken 2296 times.
✓ Branch 20 taken 3550 times.
✓ Branch 21 taken 2296 times.
✓ Branch 22 taken 3550 times.
✓ Branch 23 taken 2296 times.
58492 for (; index < length; index += 2) {
77 35516 const uint8_t u = src_row[(index * scn) + u_idx];
78 35516 const uint8_t v = src_row[(index * scn) + v_idx];
79 35516 const uint8_t y0 = src_row[(index * scn) + y_idx];
80 35516 const uint8_t y1 = src_row[(index * scn) + y_idx + scn];
81
82 35516 const int32_t u_m128 = static_cast<int32_t>(u) - 128;
83 35516 const int32_t v_m128 = static_cast<int32_t>(v) - 128;
84
85 35516 const uint8_t y_rows[2] = {y0, y1};
86 106548 uint8_t* rgbx_rows[2] = {
87 35516 dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)),
88 71032 dst_rows.as_columns().ptr_at(
89 35516 static_cast<ptrdiff_t>(index + 1))};
90
91 35516 yuv422_to_rgb(y_rows, u_m128, v_m128, rgbx_rows);
92 35516 }
93 22976 }
94 };
95 22976 loop.remaining(RemainingFunctor{src, dst_rows});
96
97 22976 ++dst_rows;
98 22976 }
99
100 523 return KLEIDICV_OK;
101 523 }
102
103 private:
104 KLEIDICV_FORCE_INLINE
105 2304 static uint8x16_t normalize_and_pack(int32x4_t vec_0, int32x4_t vec_1,
106 int32x4_t vec_2, int32x4_t vec_3) {
107 2304 int16x4_t tmp_lo_lo = vqshrun_n_s32(vec_0, kWeightScale - 8);
108 4608 int16x8_t tmp_lo_hi =
109 2304 vqshrun_high_n_s32(tmp_lo_lo, vec_1, kWeightScale - 8);
110 2304 int16x4_t tmp_hi_lo = vqshrun_n_s32(vec_2, kWeightScale - 8);
111 4608 int16x8_t tmp_hi_hi =
112 2304 vqshrun_high_n_s32(tmp_hi_lo, vec_3, kWeightScale - 8);
113 4608 uint8x16_t output =
114 2304 vtrn2q_u8(vreinterpretq_u8(tmp_lo_hi), vreinterpretq_u8(tmp_hi_hi));
115 4608 return output;
116 2304 }
117 // Convert two blocks of YUV422 (deinterleaved) data into RGBx color format.
118 // Each block contains 16 Y values (y_even_lanes, y_odd_lanes) plus shared U
119 // and V values. The function computes R, G, B channels, normalizes, and
120 // stores results either as RGB (3 channels) or RGBA (4 channels).
121 KLEIDICV_FORCE_INLINE
122 384 static void yuv422_to_rgb(const uint8x16_t& y_even_lanes,
123 const uint8x16_t& y_odd_lanes, const uint8x16_t& u,
124 const uint8x16_t& v, uint8_t* rgbx0,
125 uint8_t* rgbx1) {
126 // --- Preprocess Y channel ---
127 // Subtract 16 from luma (Y') with saturation and widen later to 32 bits.
128 384 uint8x16_t y_even_lanes_m16 = vqsubq_u8(y_even_lanes, vdupq_n_u8(16));
129 384 uint8x16_t y_odd_lanes_m16 = vqsubq_u8(y_odd_lanes, vdupq_n_u8(16));
130
131 // --- Zero-extend (8 → 32) via table lookups ---
132 // The masks feed vqtbl1q_u8 so each lookup pulls 4 bytes out of a 16-lane
133 // u8 vector and places the selected byte as the least-significant byte of a
134 // 32-bit lane while zeroing the remaining three bytes.
135 // vqtbl1q_u8 inserts 0 for indices ≥ 16 (e.g., 0xFF), letting us build
136 // [x,0,0,0] groups that we reinterpret as int32x4_t to get u8→s32 lanes in
137 // one step.
138 384 const uint8x16_t index_0 = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff,
139 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff};
140 384 const uint8x16_t index_1 = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff,
141 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff};
142 384 const uint8x16_t index_2 = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff,
143 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff};
144 384 const uint8x16_t index_3 = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff,
145 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
146
147 // Expand Y values into 32-bit lanes for later arithmetic.
148 // Note: "even" and "odd" describe the pixel position in the YUV422 packing,
149 // not the Y component itself.
150 //
151 // In YUV422, pixels are stored as (Y0, U0, Y1, V0).
152 // - The "even" vectors collect Y0, Y2, Y4, ... → these generate the
153 // even-positioned RGB outputs.
154 // - The "odd" vectors collect Y1, Y3, Y5, ... → these generate the
155 // odd-positioned RGB outputs.
156 768 int32x4_t y_even_lo_lo =
157 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_0));
158 768 int32x4_t y_even_lo_hi =
159 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_1));
160 768 int32x4_t y_even_hi_lo =
161 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_2));
162 768 int32x4_t y_even_hi_hi =
163 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_3));
164 768 int32x4_t y_odd_lo_lo =
165 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_0));
166 768 int32x4_t y_odd_lo_hi =
167 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_1));
168 768 int32x4_t y_odd_hi_lo =
169 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_2));
170 768 int32x4_t y_odd_hi_hi =
171 384 vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_3));
172
173 // Expand U and V into 32-bit lanes (shared chroma).
174 // In YUV422, each U and V value is shared by a pair of pixels:
175 // (Y_even, U, Y_odd, V)
176 // Therefore, the same U and V vectors are used when computing both
177 // the "even" and "odd" RGB outputs.
178 384 int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_0));
179 384 int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_1));
180 384 int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_2));
181 384 int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_3));
182 384 int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_0));
183 384 int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_1));
184 384 int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_2));
185 384 int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_3));
186
187 // Scale the Y (luma) values by the fixed coefficient kYWeight.
188 // This produces the weighted luma contribution (Y') that forms the
189 // base term for all R, G, and B channel calculations in the
190 // YUV → RGB conversion.
191 384 y_even_lo_lo = vmulq_n_s32(y_even_lo_lo, kYWeight);
192 384 y_even_lo_hi = vmulq_n_s32(y_even_lo_hi, kYWeight);
193 384 y_even_hi_lo = vmulq_n_s32(y_even_hi_lo, kYWeight);
194 384 y_even_hi_hi = vmulq_n_s32(y_even_hi_hi, kYWeight);
195 384 y_odd_lo_lo = vmulq_n_s32(y_odd_lo_lo, kYWeight);
196 384 y_odd_lo_hi = vmulq_n_s32(y_odd_lo_hi, kYWeight);
197 384 y_odd_hi_lo = vmulq_n_s32(y_odd_hi_lo, kYWeight);
198 384 y_odd_hi_hi = vmulq_n_s32(y_odd_hi_hi, kYWeight);
199
200 // Precompute constant base offsets for R, G, and B channels.
201 // These include the rounding term (1 << (kWeightScale - 1)) and the
202 // bias correction for centering U and V around 128.
203 // This ensures that chroma values (U,V) are properly zero-based before
204 // applying their respective weighting factors in the YUV → RGB formulas.
205 768 int32x4_t r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
206 384 128 * kUVWeights[kRVWeightIndex])};
207 768 int32x4_t g_base_{vdupq_n_s32(
208 384 (1 << (kWeightScale - 1)) -
209 384 128 * (kUVWeights[kGUWeightIndex] + kUVWeights[kGVWeightIndex]))};
210 768 int32x4_t b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
211 384 128 * kUVWeights[kBUWeightIndex])};
212
213 // --- Compute the Red channel ---
214 // Formula: R = Y + (kRV * V) + bias
215 // - Start with r_base_ (rounding + bias correction for V centered at 128).
216 // - Multiply V by kUVWeights[kRVWeightIndex] and add the result to r_base_.
217 // - Reuse the same V contribution for both even and odd pixels, since
218 // chroma is shared in YUV422.
219 // - Finally, add the weighted Y values (even and odd) to produce
220 // the full R channel before normalization and packing to 8 bits.
221 768 int32x4_t r_even_lo_lo =
222 384 vmlaq_n_s32(r_base_, v_lo_lo, kUVWeights[kRVWeightIndex]);
223 768 int32x4_t r_even_lo_hi =
224 384 vmlaq_n_s32(r_base_, v_lo_hi, kUVWeights[kRVWeightIndex]);
225 768 int32x4_t r_even_hi_lo =
226 384 vmlaq_n_s32(r_base_, v_hi_lo, kUVWeights[kRVWeightIndex]);
227 768 int32x4_t r_even_hi_hi =
228 384 vmlaq_n_s32(r_base_, v_hi_hi, kUVWeights[kRVWeightIndex]);
229
230 // Odd pixels reuse the same chroma base, so compute them before the even
231 // registers are updated with their Y contribution.
232 384 int32x4_t r_odd_lo_lo = vaddq_s32(r_even_lo_lo, y_odd_lo_lo);
233 384 int32x4_t r_odd_lo_hi = vaddq_s32(r_even_lo_hi, y_odd_lo_hi);
234 384 int32x4_t r_odd_hi_lo = vaddq_s32(r_even_hi_lo, y_odd_hi_lo);
235 384 int32x4_t r_odd_hi_hi = vaddq_s32(r_even_hi_hi, y_odd_hi_hi);
236
237 384 r_even_lo_lo = vaddq_s32(r_even_lo_lo, y_even_lo_lo);
238 384 r_even_lo_hi = vaddq_s32(r_even_lo_hi, y_even_lo_hi);
239 384 r_even_hi_lo = vaddq_s32(r_even_hi_lo, y_even_hi_lo);
240 384 r_even_hi_hi = vaddq_s32(r_even_hi_hi, y_even_hi_hi);
241
242 // Re-interleave and pack the Red channel to u8.
243 // We computed R in four 4-lane chunks split by pixel parity:
244 // r_even_lo_lo (even pixels 0..3), r_even_lo_hi (even 4..7)
245 // r_odd_lo_lo (odd pixels 0..3), r_odd_lo_hi (odd 4..7)
246 // normalize_and_pack(...) saturates → shifts → narrows s32→u8 *and*
247 // interleaves even/odd so the output is in raster order:
248 // [R0, R1, R2, R3, ...] (i.e., even0, odd0, even1, odd1, ...).
249 // r0 packs the first 16 R samples; r1 packs the next 16, which come from
250 // the *_hi_* groups.
251 768 uint8x16_t r0 = normalize_and_pack(r_even_lo_lo, r_even_lo_hi, r_odd_lo_lo,
252 384 r_odd_lo_hi);
253 768 uint8x16_t r1 = normalize_and_pack(r_even_hi_lo, r_even_hi_hi, r_odd_hi_lo,
254 384 r_odd_hi_hi);
255
256 // --- Compute the Green channel ---
257 // Formula: G = Y + (kGU * U + kGV * V) + bias, reusing the shared U and V
258 // samples for both pixels in each YUV422 pair. normalize_and_pack(...)
259 // narrows back to u8 and interleaves even/odd results into raster order.
260 768 int32x4_t g_even_lo_lo =
261 384 vmlaq_n_s32(g_base_, u_lo_lo, kUVWeights[kGUWeightIndex]);
262 768 int32x4_t g_even_lo_hi =
263 384 vmlaq_n_s32(g_base_, u_lo_hi, kUVWeights[kGUWeightIndex]);
264 768 int32x4_t g_even_hi_lo =
265 384 vmlaq_n_s32(g_base_, u_hi_lo, kUVWeights[kGUWeightIndex]);
266 768 int32x4_t g_even_hi_hi =
267 384 vmlaq_n_s32(g_base_, u_hi_hi, kUVWeights[kGUWeightIndex]);
268
269 384 g_even_lo_lo =
270 384 vmlaq_n_s32(g_even_lo_lo, v_lo_lo, kUVWeights[kGVWeightIndex]);
271 384 g_even_lo_hi =
272 384 vmlaq_n_s32(g_even_lo_hi, v_lo_hi, kUVWeights[kGVWeightIndex]);
273 384 g_even_hi_lo =
274 384 vmlaq_n_s32(g_even_hi_lo, v_hi_lo, kUVWeights[kGVWeightIndex]);
275 384 g_even_hi_hi =
276 384 vmlaq_n_s32(g_even_hi_hi, v_hi_hi, kUVWeights[kGVWeightIndex]);
277
278 // Same rationale as for Red: capture odd pixels before the even lanes add
279 // Y.
280 384 int32x4_t g_odd_lo_lo = vaddq_s32(g_even_lo_lo, y_odd_lo_lo);
281 384 int32x4_t g_odd_lo_hi = vaddq_s32(g_even_lo_hi, y_odd_lo_hi);
282 384 int32x4_t g_odd_hi_lo = vaddq_s32(g_even_hi_lo, y_odd_hi_lo);
283 384 int32x4_t g_odd_hi_hi = vaddq_s32(g_even_hi_hi, y_odd_hi_hi);
284
285 384 g_even_lo_lo = vaddq_s32(g_even_lo_lo, y_even_lo_lo);
286 384 g_even_lo_hi = vaddq_s32(g_even_lo_hi, y_even_lo_hi);
287 384 g_even_hi_lo = vaddq_s32(g_even_hi_lo, y_even_hi_lo);
288 384 g_even_hi_hi = vaddq_s32(g_even_hi_hi, y_even_hi_hi);
289
290 768 uint8x16_t g0 = normalize_and_pack(g_even_lo_lo, g_even_lo_hi, g_odd_lo_lo,
291 384 g_odd_lo_hi);
292 768 uint8x16_t g1 = normalize_and_pack(g_even_hi_lo, g_even_hi_hi, g_odd_hi_lo,
293 384 g_odd_hi_hi);
294
295 // --- Compute the Blue channel ---
296 // Formula: B = Y + (kBU * U) + bias, sharing the same U samples across the
297 // even/odd pair before normalize_and_pack(...) interleaves the outputs.
298 768 int32x4_t b_even_lo_lo =
299 384 vmlaq_n_s32(b_base_, u_lo_lo, kUVWeights[kBUWeightIndex]);
300 768 int32x4_t b_even_lo_hi =
301 384 vmlaq_n_s32(b_base_, u_lo_hi, kUVWeights[kBUWeightIndex]);
302 768 int32x4_t b_even_hi_lo =
303 384 vmlaq_n_s32(b_base_, u_hi_lo, kUVWeights[kBUWeightIndex]);
304 768 int32x4_t b_even_hi_hi =
305 384 vmlaq_n_s32(b_base_, u_hi_hi, kUVWeights[kBUWeightIndex]);
306
307 // Blue follows the same ordering so odd lanes are finalized before evens.
308 384 int32x4_t b_odd_lo_lo = vaddq_s32(b_even_lo_lo, y_odd_lo_lo);
309 384 int32x4_t b_odd_lo_hi = vaddq_s32(b_even_lo_hi, y_odd_lo_hi);
310 384 int32x4_t b_odd_hi_lo = vaddq_s32(b_even_hi_lo, y_odd_hi_lo);
311 384 int32x4_t b_odd_hi_hi = vaddq_s32(b_even_hi_hi, y_odd_hi_hi);
312
313 384 b_even_lo_lo = vaddq_s32(b_even_lo_lo, y_even_lo_lo);
314 384 b_even_lo_hi = vaddq_s32(b_even_lo_hi, y_even_lo_hi);
315 384 b_even_hi_lo = vaddq_s32(b_even_hi_lo, y_even_hi_lo);
316 384 b_even_hi_hi = vaddq_s32(b_even_hi_hi, y_even_hi_hi);
317
318 768 uint8x16_t b0 = normalize_and_pack(b_even_lo_lo, b_even_lo_hi, b_odd_lo_lo,
319 384 b_odd_lo_hi);
320 768 uint8x16_t b1 = normalize_and_pack(b_even_hi_lo, b_even_hi_hi, b_odd_hi_lo,
321 384 b_odd_hi_hi);
322
323 if constexpr (dcn > 3) {
324 192 uint8x16x4_t rgba0, rgba1;
325 // Red channel
326 192 rgba0.val[2 - b_idx] = r0;
327 192 rgba1.val[2 - b_idx] = r1;
328 // Green channel
329 192 rgba0.val[1] = g0;
330 192 rgba1.val[1] = g1;
331 // Blue channel
332 192 rgba0.val[b_idx] = b0;
333 192 rgba1.val[b_idx] = b1;
334 // Alpha channel
335 192 rgba0.val[3] = vdupq_n_u8(0xFF);
336 192 rgba1.val[3] = vdupq_n_u8(0xFF);
337 // Store RGB pixels to memory.
338 192 vst4q_u8(rgbx0, rgba0);
339 192 vst4q_u8(rgbx1, rgba1);
340
341 192 } else {
342 192 uint8x16x3_t rgba0, rgba1;
343 // Red channel
344 192 rgba0.val[2 - b_idx] = r0;
345 192 rgba1.val[2 - b_idx] = r1;
346 // Green channel
347 192 rgba0.val[1] = g0;
348 192 rgba1.val[1] = g1;
349 // Blue channel
350 192 rgba0.val[b_idx] = b0;
351 192 rgba1.val[b_idx] = b1;
352 // Store RGB pixels to memory.
353 192 vst3q_u8(rgbx0, rgba0);
354 192 vst3q_u8(rgbx1, rgba1);
355 192 }
356 384 }
357
358 KLEIDICV_FORCE_INLINE
359 35516 static void yuv422_to_rgb(const uint8_t y_rows[2], int32_t u_m128,
360 int32_t v_m128, uint8_t* rgbx_rows[2]) {
361 71032 int32_t r_sub_y =
362 35516 kUVWeights[kRVWeightIndex] * v_m128 + (1 << (kWeightScale - 1));
363 106548 int32_t g_sub_y = kUVWeights[kGUWeightIndex] * u_m128 +
364 71032 kUVWeights[kGVWeightIndex] * v_m128 +
365 (1 << (kWeightScale - 1));
366 71032 int32_t b_sub_y =
367 35516 kUVWeights[kBUWeightIndex] * u_m128 + (1 << (kWeightScale - 1));
368
369
24/24
✓ Branch 0 taken 3550 times.
✓ Branch 1 taken 7100 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 3550 times.
✓ Branch 7 taken 7100 times.
✓ Branch 8 taken 3550 times.
✓ Branch 9 taken 7100 times.
✓ Branch 10 taken 3550 times.
✓ Branch 11 taken 7100 times.
✓ Branch 12 taken 3550 times.
✓ Branch 13 taken 7100 times.
✓ Branch 14 taken 3550 times.
✓ Branch 15 taken 7100 times.
✓ Branch 16 taken 3550 times.
✓ Branch 17 taken 7100 times.
✓ Branch 18 taken 3550 times.
✓ Branch 19 taken 7100 times.
✓ Branch 20 taken 3550 times.
✓ Branch 21 taken 7100 times.
✓ Branch 22 taken 3550 times.
✓ Branch 23 taken 7100 times.
106548 for (size_t selector = 0; selector < 2; ++selector) {
370 71032 int32_t y = kYWeight * std::max(y_rows[selector] - 16, 0);
371 71032 int32_t r = y + r_sub_y;
372 71032 int32_t g = y + g_sub_y;
373 71032 int32_t b = y + b_sub_y;
374
375 71032 r >>= kWeightScale;
376 71032 g >>= kWeightScale;
377 71032 b >>= kWeightScale;
378
379 71032 rgbx_rows[selector][2 - b_idx] = saturating_cast<int32_t, uint8_t>(r);
380 71032 rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g);
381 71032 rgbx_rows[selector][b_idx] = saturating_cast<int32_t, uint8_t>(b);
382
383 if constexpr (dcn > 3) {
384 42600 rgbx_rows[selector][3] = 0xFF;
385 }
386
387 71032 rgbx_rows[selector] += dcn;
388 71032 }
389 35516 }
390 };
391
392 KLEIDICV_TARGET_FN_ATTRS
393 562 kleidicv_error_t yuv422_to_rgb_u8(const uint8_t* src, size_t src_stride,
394 uint8_t* dst, size_t dst_stride, size_t width,
395 size_t height,
396 kleidicv_color_conversion_t color_format) {
397
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 561 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 561 times.
562 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
398
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 560 times.
561 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
399
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 557 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 555 times.
✓ Branch 4 taken 5 times.
✓ Branch 5 taken 555 times.
560 CHECK_IMAGE_SIZE(width, height);
400
401 // YUV422 packs pixels in pairs: (Y0, U, Y1, V).
402 // Therefore, the image width must be at least 2 and always even.
403
4/4
✓ Branch 0 taken 553 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 543 times.
555 if (width < 2 || (width % 2) != 0) {
404 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
405 }
406
407
13/13
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 53 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 1 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 52 times.
✓ Branch 11 taken 52 times.
✓ Branch 12 taken 52 times.
543 switch (color_format) {
408 case KLEIDICV_YUYV_TO_BGR:
409 53 return YUV422ToRGBxOrBGRx<0, 1, 0, 3>::yuv2rgbx_operation(
410 53 src, src_stride, dst, dst_stride, width, height);
411 break;
412 case KLEIDICV_UYVY_TO_BGR:
413 1 return YUV422ToRGBxOrBGRx<0, 0, 1, 3>::yuv2rgbx_operation(
414 1 src, src_stride, dst, dst_stride, width, height);
415 break;
416 case KLEIDICV_YVYU_TO_BGR:
417 1 return YUV422ToRGBxOrBGRx<0, 3, 0, 3>::yuv2rgbx_operation(
418 1 src, src_stride, dst, dst_stride, width, height);
419 break;
420 case KLEIDICV_YUYV_TO_RGB:
421 52 return YUV422ToRGBxOrBGRx<2, 1, 0, 3>::yuv2rgbx_operation(
422 52 src, src_stride, dst, dst_stride, width, height);
423 break;
424 case KLEIDICV_UYVY_TO_RGB:
425 52 return YUV422ToRGBxOrBGRx<2, 0, 1, 3>::yuv2rgbx_operation(
426 52 src, src_stride, dst, dst_stride, width, height);
427 break;
428 case KLEIDICV_YVYU_TO_RGB:
429 52 return YUV422ToRGBxOrBGRx<2, 3, 0, 3>::yuv2rgbx_operation(
430 52 src, src_stride, dst, dst_stride, width, height);
431 break;
432 case KLEIDICV_YUYV_TO_BGRA:
433 52 return YUV422ToRGBxOrBGRx<0, 1, 0, 4>::yuv2rgbx_operation(
434 52 src, src_stride, dst, dst_stride, width, height);
435 break;
436 case KLEIDICV_UYVY_TO_BGRA:
437 52 return YUV422ToRGBxOrBGRx<0, 0, 1, 4>::yuv2rgbx_operation(
438 52 src, src_stride, dst, dst_stride, width, height);
439 break;
440 case KLEIDICV_YVYU_TO_BGRA:
441 52 return YUV422ToRGBxOrBGRx<0, 3, 0, 4>::yuv2rgbx_operation(
442 52 src, src_stride, dst, dst_stride, width, height);
443 break;
444 case KLEIDICV_YUYV_TO_RGBA:
445 52 return YUV422ToRGBxOrBGRx<2, 1, 0, 4>::yuv2rgbx_operation(
446 52 src, src_stride, dst, dst_stride, width, height);
447 break;
448 case KLEIDICV_UYVY_TO_RGBA:
449 52 return YUV422ToRGBxOrBGRx<2, 0, 1, 4>::yuv2rgbx_operation(
450 52 src, src_stride, dst, dst_stride, width, height);
451 break;
452 case KLEIDICV_YVYU_TO_RGBA:
453 52 return YUV422ToRGBxOrBGRx<2, 3, 0, 4>::yuv2rgbx_operation(
454 52 src, src_stride, dst, dst_stride, width, height);
455 break;
456 default:
457 20 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
458 break;
459 }
460 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
461 562 }
462
463 } // namespace kleidicv::neon
464