KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv_p_to_rgb_sc.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 109 109 100.0%
Functions: 88 96 91.7%
Branches: 72 72 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV_P_TO_RGB_SC_H
6 #define KLEIDICV_YUV_P_TO_RGB_SC_H
7
8 #include <algorithm>
9
10 #include "kleidicv/conversions/yuv_420_to_rgb.h"
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/sve2.h"
13 #include "yuv420_to_rgb_sc.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 template <bool BGR, bool kAlpha>
18 class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha> {
19 public:
20 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb;
21
22 1312 explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING
23 1312 : YUV420XToRGBxOrBGRx<BGR, kAlpha>(v_first) {}
24
25 // Returns the number of channels in the output image.
26 1088 static constexpr size_t output_channels() KLEIDICV_STREAMING {
27 1088 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
28 }
29
30 // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and
31 // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration.
32 21600 void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u,
33 svint16_t &v, uint8_t *rgbx_row_0,
34 uint8_t *rgbx_row_1) KLEIDICV_STREAMING {
35 21600 yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1);
36 21600 }
37 }; // end of class YUVpToRGBxOrBGRx<bool, bool>
38
39 using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>;
40 using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>;
41 using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>;
42 using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>;
43
44 template <typename OperationType, typename ScalarType>
45 1312 kleidicv_error_t yuv2rgbx_operation(OperationType &operation,
46 const ScalarType *src, size_t src_stride,
47 ScalarType *dst, size_t dst_stride,
48 size_t width, size_t height, size_t begin,
49 size_t end) KLEIDICV_STREAMING {
50
16/16
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 314 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 314 times.
✓ Branch 4 taken 14 times.
✓ Branch 5 taken 314 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 314 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 314 times.
✓ Branch 10 taken 14 times.
✓ Branch 11 taken 314 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 314 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 314 times.
1312 CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2);
51
16/16
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 300 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 300 times.
✓ Branch 4 taken 14 times.
✓ Branch 5 taken 300 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 300 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 300 times.
✓ Branch 10 taken 14 times.
✓ Branch 11 taken 300 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 300 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 300 times.
1256 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
52
24/24
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 286 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 272 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 286 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 272 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 272 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 286 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 272 times.
✓ Branch 16 taken 28 times.
✓ Branch 17 taken 272 times.
✓ Branch 18 taken 14 times.
✓ Branch 19 taken 286 times.
✓ Branch 20 taken 14 times.
✓ Branch 21 taken 272 times.
✓ Branch 22 taken 28 times.
✓ Branch 23 taken 272 times.
1200 CHECK_IMAGE_SIZE(width, height);
53
54 // Pointer to the start of the U plane.
55 // Since `src` points to a planar YUV buffer, the Y plane comes first,
56 // occupying `src_stride * height` bytes.
57 1088 const ScalarType *u = src + src_stride * height;
58 // Pointer to the start of the V plane.
59 // The V plane follows the U plane. Both U and V planes are
60 // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and
61 // are often stored in a single contiguous chroma region in memory. Depending
62 // on image height and stride, the starting offset of V may require adjustment
63 // to maintain correct alignment. In particular, when the image height is not
64 // divisible evenly by 4, the chroma rows may not align perfectly, so a
65 // fractional offset (in rows) is applied to calculate the V plane position.
66 // The formula used here accounts for this by adjusting based on row parity,
67 // assuming consistent memory layout across the Y, U, and V planes.
68 2176 const ScalarType *v =
69 1088 u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2);
70
71 // These indices control how U and V row strides are selected across the image
72 // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to
73 // two luma (Y) rows. However, when the image height is not divisible by 4,
74 // the mapping between chroma and luma rows becomes asymmetric. Specifically,
75 // when `height % 4 == 2`, the start of the V plane is offset by one chroma
76 // row relative to U.
77 //
78 // This results in U and V rows being interleaved with a phase difference,
79 // which must be accounted for during row-wise traversal. To handle this,
80 // `u_index` and `v_index` are used to alternate the stride selection
81 // independently for U and V across the loop.
82 //
83 // This mechanism ensures that memory access patterns remain correct,
84 // especially in layouts where U and V share a contiguous buffer with
85 // alternating strides. Offsetting `v_index` allows the traversal logic to
86 // maintain correct alignment and prevents misaligned or incorrect reads from
87 // the chroma buffer.
88 1088 size_t u_index = 0;
89 1088 size_t v_index = height % 4 == 2 ? 1 : 0;
90
91 // Compute the actual row range in the Y plane (full resolution).
92 // Since each UV row maps to 2 Y rows, we double the begin/end indices.
93 1088 size_t row_begin = begin * 2;
94 1088 size_t row_end = std::min<size_t>(height, end * 2);
95 1088 size_t row_uv = begin;
96
97 // UV stepping pattern: first half of row, then padded second half.
98 // Needed to match row strides between chroma and luma components.
99 1088 size_t uv_strides[2] = {width / 2, src_stride - width / 2};
100
101 // Calculate starting pointers for Y, U, and V planes at the given stripe
102 // start.
103 1088 const ScalarType *y0 = src + row_begin * src_stride;
104 1088 u = u + row_uv * src_stride / 2;
105 1088 v = v + row_uv * src_stride / 2;
106
107 1088 size_t dcn = operation.output_channels();
108 1088 const size_t kVectorLength = svcntb();
109
8/8
✓ Branch 0 taken 5162 times.
✓ Branch 1 taken 272 times.
✓ Branch 2 taken 5162 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 5162 times.
✓ Branch 5 taken 272 times.
✓ Branch 6 taken 5162 times.
✓ Branch 7 taken 272 times.
21736 for (size_t h = row_begin; h < row_end; h += 2) {
110 20648 ScalarType *row0 = dst + dst_stride * h;
111 20648 ScalarType *row1 = dst + dst_stride * (h + 1);
112 20648 const ScalarType *y1 = y0 + src_stride;
113
114 // Guard for odd-height images.
115 // If the last row in the stripe is unpaired (odd number of rows),
116 // reuse the previous row pointers to avoid out-of-bounds access.
117
8/8
✓ Branch 0 taken 5024 times.
✓ Branch 1 taken 138 times.
✓ Branch 2 taken 5024 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 5024 times.
✓ Branch 5 taken 138 times.
✓ Branch 6 taken 5024 times.
✓ Branch 7 taken 138 times.
20648 if (KLEIDICV_UNLIKELY(h == (row_end - 1))) {
118 552 row1 = row0;
119 552 y1 = y0;
120 552 }
121
122 20648 LoopUnroll2 loop{width, svcntb()};
123
124 21080 loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING {
125 432 svbool_t pg = svptrue_b8();
126 432 svuint8_t u8_vec = svld1(pg, u + index / 2);
127 432 svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec));
128 432 svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec));
129
130 432 svuint8_t v8_vec = svld1(pg, v + index / 2);
131 432 svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec));
132 432 svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec));
133
134 #if KLEIDICV_TARGET_SME2
135 // assume the predicate is full true
136 144 svcount_t pg_counter = svptrue_c8();
137 144 svuint8x2_t y_even = svld1_x2(pg_counter, y0 + index);
138 144 svuint8x2_t y_odd = svld1_x2(pg_counter, y1 + index);
139 144 svuint8_t y0_vec = svget2(y_even, 0);
140 144 svuint8_t y1_vec = svget2(y_odd, 0);
141 144 svuint8_t y2_vec = svget2(y_even, 1);
142 144 svuint8_t y3_vec = svget2(y_odd, 1);
143 #else
144 288 svuint8_t y0_vec = svld1(pg, y0 + index);
145 288 svuint8_t y1_vec = svld1(pg, y1 + index);
146 288 svuint8_t y2_vec = svld1(pg, y0 + index + kVectorLength);
147 288 svuint8_t y3_vec = svld1(pg, y1 + index + kVectorLength);
148 #endif // KLEIDICV_TARGET_SME2
149
150 864 operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo,
151 432 &row0[index * dcn], &row1[index * dcn]);
152
153 864 operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi,
154 432 &row0[(index + kVectorLength) * dcn],
155 432 &row1[(index + kVectorLength) * dcn]);
156 432 });
157
158 20944 loop.unroll_once([&](size_t index) KLEIDICV_STREAMING {
159 296 svbool_t pg = svptrue_b8();
160 296 svbool_t pg_half = svwhilelt_b8(0UL, svcntb() / 2);
161
162 296 svuint8_t u8_vec = svld1(pg_half, u + index / 2);
163 296 svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec));
164
165 296 svuint8_t v8_vec = svld1(pg_half, v + index / 2);
166 296 svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec));
167
168 296 svuint8_t y0_vec = svld1(pg, y0 + index);
169 296 svuint8_t y1_vec = svld1(pg, y1 + index);
170
171 592 operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo,
172 296 &row0[index * dcn], &row1[index * dcn]);
173 296 });
174
175 41088 loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING {
176 20440 svbool_t pg = svwhilelt_b8_u64(index, length);
177 20440 svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) >> 1);
178
179 20440 svuint8_t u8_vec = svld1(pg_half, u + index / 2);
180 20440 svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec));
181
182 20440 svuint8_t v8_vec = svld1(pg_half, v + index / 2);
183 20440 svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec));
184
185 20440 svuint8_t y0_vec = svld1(pg, y0 + index);
186 20440 svuint8_t y1_vec = svld1(pg, y1 + index);
187
188 40880 operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo,
189 20440 &row0[index * dcn], &row1[index * dcn]);
190 20440 });
191
192 20648 y0 += src_stride * 2;
193 20648 u += uv_strides[(u_index++) & 1];
194 20648 v += uv_strides[(v_index++) & 1];
195 20648 }
196
197 1088 return KLEIDICV_OK;
198 1312 }
199
200 KLEIDICV_TARGET_FN_ATTRS
201 328 static kleidicv_error_t yuv_p_to_rgb_stripe_u8_sc(
202 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
203 size_t width, size_t height, bool v_first, size_t begin,
204 size_t end) KLEIDICV_STREAMING {
205 328 YUVpToRGB operation{v_first};
206 984 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
207 328 height, begin, end);
208 328 }
209
210 KLEIDICV_TARGET_FN_ATTRS
211 328 static kleidicv_error_t yuv_p_to_rgba_stripe_u8_sc(
212 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
213 size_t width, size_t height, bool v_first, size_t begin,
214 size_t end) KLEIDICV_STREAMING {
215 328 YUVpToRGBA operation{v_first};
216 984 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
217 328 height, begin, end);
218 328 }
219
220 KLEIDICV_TARGET_FN_ATTRS
221 328 static kleidicv_error_t yuv_p_to_bgr_stripe_u8_sc(
222 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
223 size_t width, size_t height, bool v_first, size_t begin,
224 size_t end) KLEIDICV_STREAMING {
225 328 YUVpToBGR operation{v_first};
226 984 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
227 328 height, begin, end);
228 328 }
229
230 KLEIDICV_TARGET_FN_ATTRS
231 328 static kleidicv_error_t yuv_p_to_bgra_stripe_u8_sc(
232 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
233 size_t width, size_t height, bool v_first, size_t begin,
234 size_t end) KLEIDICV_STREAMING {
235 328 YUVpToBGRA operation{v_first};
236 984 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
237 328 height, begin, end);
238 328 }
239 } // namespace KLEIDICV_TARGET_NAMESPACE
240
241 #endif // KLEIDICV_YUV_P_TO_RGB_SC_H
242