KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv420p_to_rgb_sc.h
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 133 133 100.0%
Functions: 79 87 90.8%
Branches: 89 89 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV420P_TO_RGB_SC_H
6 #define KLEIDICV_YUV420P_TO_RGB_SC_H
7
8 #include <algorithm>
9
10 #include "kleidicv/conversions/yuv_to_rgb.h"
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/sve2.h"
13 #include "yuv420_to_rgb_sc.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 template <bool BGR, bool kAlpha>
18 class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha> {
19 public:
20 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb;
21
22 2352 explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING
23 2352 : YUV420XToRGBxOrBGRx<BGR, kAlpha>(v_first) {}
24
25 // Returns the number of channels in the output image.
26 2192 static constexpr size_t output_channels() KLEIDICV_STREAMING {
27 2192 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
28 }
29
30 // Processes one vector of Y data from each of two rows (even and odd), and
31 // outputs 3 (or 4) vectors of RGB (or RGBA) data per row on each call.
32 KLEIDICV_FORCE_INLINE
33 40032 void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u,
34 svint16_t &v, uint8_t *rgbx_row_0,
35 uint8_t *rgbx_row_1) const KLEIDICV_STREAMING {
36 40032 yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1);
37 40032 }
38 }; // end of class YUVpToRGBxOrBGRx<bool, bool>
39
40 using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>;
41 using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>;
42 using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>;
43 using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>;
44
45 template <typename OperationType, typename ScalarType>
46 2352 kleidicv_error_t yuv2rgbx_operation(OperationType &operation,
47 const ScalarType *src, size_t src_stride,
48 ScalarType *dst, size_t dst_stride,
49 size_t width, size_t height, size_t begin,
50 size_t end) KLEIDICV_STREAMING {
51
16/16
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 582 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 582 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 582 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 582 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 582 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 582 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 582 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 582 times.
2352 CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2);
52
16/16
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 576 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 576 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 576 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 576 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 576 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 576 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 576 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 576 times.
2328 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
53
24/24
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 548 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 548 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 562 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 548 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 548 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 562 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 548 times.
✓ Branch 16 taken 28 times.
✓ Branch 17 taken 548 times.
✓ Branch 18 taken 14 times.
✓ Branch 19 taken 562 times.
✓ Branch 20 taken 14 times.
✓ Branch 21 taken 548 times.
✓ Branch 22 taken 28 times.
✓ Branch 23 taken 548 times.
2304 CHECK_IMAGE_SIZE(width, height);
54
55 // Pointer to the start of the U plane.
56 // Since `src` points to a planar YUV buffer, the Y plane comes first,
57 // occupying `src_stride * height` bytes.
58 2192 const ScalarType *u = src + src_stride * height;
59 // Pointer to the start of the V plane.
60 // The V plane follows the U plane. Both U and V planes are
61 // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and
62 // are often stored in a single contiguous chroma region in memory. Depending
63 // on image height and stride, the starting offset of V may require adjustment
64 // to maintain correct alignment. In particular, when the image height is not
65 // divisible evenly by 4, the chroma rows may not align perfectly, so a
66 // fractional offset (in rows) is applied to calculate the V plane position.
67 // The formula used here accounts for this by adjusting based on row parity,
68 // assuming consistent memory layout across the Y, U, and V planes.
69 4384 const ScalarType *v =
70 2192 u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2);
71
72 // These indices control how U and V row strides are selected across the image
73 // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to
74 // two luma (Y) rows. However, when the image height is not divisible by 4,
75 // the mapping between chroma and luma rows becomes asymmetric. Specifically,
76 // when `height % 4 == 2`, the start of the V plane is offset by one chroma
77 // row relative to U.
78 //
79 // This results in U and V rows being interleaved with a phase difference,
80 // which must be accounted for during row-wise traversal. To handle this,
81 // `u_index` and `v_index` are used to alternate the stride selection
82 // independently for U and V across the loop.
83 //
84 // This mechanism ensures that memory access patterns remain correct,
85 // especially in layouts where U and V share a contiguous buffer with
86 // alternating strides. Offsetting `v_index` allows the traversal logic to
87 // maintain correct alignment and prevents misaligned or incorrect reads from
88 // the chroma buffer.
89 2192 size_t u_index = 0;
90 2192 size_t v_index = height % 4 == 2 ? 1 : 0;
91
92 // Compute the actual row range in the Y plane (full resolution).
93 // Since each UV row maps to 2 Y rows, we double the begin/end indices.
94 2192 size_t row_begin = begin * 2;
95 2192 size_t row_end = std::min<size_t>(height, end * 2);
96 2192 size_t row_uv = begin;
97
98 // UV stepping pattern: two chroma rows share one `src_stride`-sized row, so
99 // the step alternates between `width / 2` and `src_stride - width / 2`.
100 2192 size_t uv_strides[2] = {width / 2, src_stride - width / 2};
101
102 // Calculate starting pointers for Y, U, and V planes at the given stripe
103 // start.
104 2192 const ScalarType *y0 = src + row_begin * src_stride;
105 2192 u = u + (row_uv / 2) * src_stride;
106 2192 v = v + (row_uv / 2) * src_stride;
107
108
8/8
✓ Branch 0 taken 428 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 428 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 428 times.
✓ Branch 5 taken 120 times.
✓ Branch 6 taken 428 times.
✓ Branch 7 taken 120 times.
2192 if (row_uv % 2 == 1) {
109 480 u += uv_strides[(u_index++) & 1];
110 480 v += uv_strides[(v_index++) & 1];
111 480 }
112
113 2192 size_t dcn = operation.output_channels();
114 2192 const size_t kVectorLength = svcntb();
115
8/8
✓ Branch 0 taken 9770 times.
✓ Branch 1 taken 548 times.
✓ Branch 2 taken 9770 times.
✓ Branch 3 taken 548 times.
✓ Branch 4 taken 9770 times.
✓ Branch 5 taken 548 times.
✓ Branch 6 taken 9770 times.
✓ Branch 7 taken 548 times.
41272 for (size_t h = row_begin; h < row_end; h += 2) {
116 39080 ScalarType *row0 = dst + dst_stride * h;
117 39080 ScalarType *row1 = dst + dst_stride * (h + 1);
118 39080 const ScalarType *y1 = y0 + src_stride;
119
120 // Guard for odd-height images.
121 // If the last row in the stripe is unpaired (odd number of rows), alias the
122 // second-row pointers to the first row to avoid out-of-bounds access.
123
8/8
✓ Branch 0 taken 9568 times.
✓ Branch 1 taken 202 times.
✓ Branch 2 taken 9568 times.
✓ Branch 3 taken 202 times.
✓ Branch 4 taken 9568 times.
✓ Branch 5 taken 202 times.
✓ Branch 6 taken 9568 times.
✓ Branch 7 taken 202 times.
39080 if (KLEIDICV_UNLIKELY(h == (row_end - 1))) {
124 808 row1 = row0;
125 808 y1 = y0;
126 808 }
127
128 39080 LoopUnroll2 loop{width, svcntb()};
129
130 struct VectorPath2x {
131 const ScalarType *y0, *y1, *u, *v;
132 ScalarType *row0, *row1;
133 const size_t kVectorLength, dcn;
134 OperationType operation;
135 KLEIDICV_FORCE_INLINE
136 432 void operator()(size_t index) const KLEIDICV_STREAMING {
137 432 svbool_t pg = svptrue_b8();
138 432 svuint8_t u8_vec = svld1(pg, u + index / 2);
139 432 svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec));
140 432 svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec));
141
142 432 svuint8_t v8_vec = svld1(pg, v + index / 2);
143 432 svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec));
144 432 svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec));
145
146 #if KLEIDICV_TARGET_SME2
147 // The predicate is all-true here, so use an all-true predicate-as-counter.
148 144 svcount_t pg_counter = svptrue_c8();
149 144 svuint8x2_t y_even = svld1_x2(pg_counter, y0 + index);
150 144 svuint8x2_t y_odd = svld1_x2(pg_counter, y1 + index);
151 144 svuint8_t y0_vec = svget2(y_even, 0);
152 144 svuint8_t y1_vec = svget2(y_odd, 0);
153 144 svuint8_t y2_vec = svget2(y_even, 1);
154 144 svuint8_t y3_vec = svget2(y_odd, 1);
155 #else
156 288 svuint8_t y0_vec = svld1(pg, y0 + index);
157 288 svuint8_t y1_vec = svld1(pg, y1 + index);
158 288 svuint8_t y2_vec = svld1(pg, y0 + index + kVectorLength);
159 288 svuint8_t y3_vec = svld1(pg, y1 + index + kVectorLength);
160 #endif // KLEIDICV_TARGET_SME2
161
162 864 operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo,
163 432 &row0[index * dcn], &row1[index * dcn]);
164
165 864 operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi,
166 432 &row0[(index + kVectorLength) * dcn],
167 432 &row1[(index + kVectorLength) * dcn]);
168 432 }
169 };
170 39080 loop.unroll_twice(
171 39080 VectorPath2x{y0, y1, u, v, row0, row1, kVectorLength, dcn, operation});
172
173 struct VectorPath1x {
174 const ScalarType *y0, *y1, *u, *v;
175 ScalarType *row0, *row1;
176 const size_t dcn;
177 OperationType operation;
178 KLEIDICV_FORCE_INLINE
179 448 void operator()(size_t index) const KLEIDICV_STREAMING {
180 448 svbool_t pg = svptrue_b8();
181 448 svbool_t pg_half = svwhilelt_b8(0UL, svcntb() / 2);
182
183 448 svuint8_t u8_vec = svld1(pg_half, u + index / 2);
184 448 svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec));
185
186 448 svuint8_t v8_vec = svld1(pg_half, v + index / 2);
187 448 svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec));
188
189 448 svuint8_t y0_vec = svld1(pg, y0 + index);
190 448 svuint8_t y1_vec = svld1(pg, y1 + index);
191
192 896 operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo,
193 448 &row0[index * dcn], &row1[index * dcn]);
194 448 }
195 };
196 39080 loop.unroll_once(VectorPath1x{y0, y1, u, v, row0, row1, dcn, operation});
197
198 struct RemainingPath {
199 const ScalarType *y0, *y1, *u, *v;
200 ScalarType *row0, *row1;
201 const size_t dcn;
202 OperationType operation;
203 KLEIDICV_FORCE_INLINE
204 38720 void operator()(size_t index, size_t length) const KLEIDICV_STREAMING {
205 38720 svbool_t pg = svwhilelt_b8_u64(index, length);
206 38720 svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) >> 1);
207
208 38720 svuint8_t u8_vec = svld1(pg_half, u + index / 2);
209 38720 svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec));
210
211 38720 svuint8_t v8_vec = svld1(pg_half, v + index / 2);
212 38720 svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec));
213
214 38720 svuint8_t y0_vec = svld1(pg, y0 + index);
215 38720 svuint8_t y1_vec = svld1(pg, y1 + index);
216
217 77440 operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo,
218 38720 &row0[index * dcn], &row1[index * dcn]);
219 38720 }
220 };
221 39080 loop.remaining(RemainingPath{y0, y1, u, v, row0, row1, dcn, operation});
222
223 39080 y0 += src_stride * 2;
224 39080 u += uv_strides[(u_index++) & 1];
225 39080 v += uv_strides[(v_index++) & 1];
226 39080 }
227
228 2192 return KLEIDICV_OK;
229 2352 }
230
231 KLEIDICV_TARGET_FN_ATTRS
232 2476 static kleidicv_error_t yuv420p_to_rgb_stripe_u8_sc(
233 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
234 size_t width, size_t height, kleidicv_color_conversion_t color_format,
235 size_t begin, size_t end) KLEIDICV_STREAMING {
236
9/9
✓ Branch 0 taken 294 times.
✓ Branch 1 taken 294 times.
✓ Branch 2 taken 294 times.
✓ Branch 3 taken 294 times.
✓ Branch 4 taken 294 times.
✓ Branch 5 taken 294 times.
✓ Branch 6 taken 294 times.
✓ Branch 7 taken 124 times.
✓ Branch 8 taken 294 times.
2476 switch (color_format) {
237 case KLEIDICV_YV12_TO_BGR: {
238 294 YUVpToBGR operation{true};
239 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
240 294 width, height, begin, end);
241 294 }
242
243 case KLEIDICV_YV12_TO_RGB: {
244 294 YUVpToRGB operation{true};
245 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
246 294 width, height, begin, end);
247 294 }
248
249 case KLEIDICV_YV12_TO_BGRA: {
250 294 YUVpToBGRA operation{true};
251 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
252 294 width, height, begin, end);
253 294 }
254
255 case KLEIDICV_YV12_TO_RGBA: {
256 294 YUVpToRGBA operation{true};
257 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
258 294 width, height, begin, end);
259 294 }
260
261 case KLEIDICV_IYUV_TO_BGR: {
262 294 YUVpToBGR operation{false};
263 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
264 294 width, height, begin, end);
265 294 }
266
267 case KLEIDICV_IYUV_TO_RGB: {
268 294 YUVpToRGB operation{false};
269 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
270 294 width, height, begin, end);
271 294 }
272
273 case KLEIDICV_IYUV_TO_BGRA: {
274 294 YUVpToBGRA operation{false};
275 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
276 294 width, height, begin, end);
277 294 }
278
279 case KLEIDICV_IYUV_TO_RGBA: {
280 294 YUVpToRGBA operation{false};
281 588 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
282 294 width, height, begin, end);
283 294 }
284
285 default:
286 124 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
287 }
288
289 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
290 2476 }
291
292 } // namespace KLEIDICV_TARGET_NAMESPACE
293
294 #endif // KLEIDICV_YUV420P_TO_RGB_SC_H
295