KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv420p_to_rgb_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 126 126 100.0%
Functions: 25 25 100.0%
Branches: 113 113 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <utility>
6
7 #include "kleidicv/conversions/yuv_to_rgb.h"
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10 #include "yuv420_to_rgb_neon.h"
11
12 namespace kleidicv::neon {
13 template <bool BGR, bool kAlpha>
14 class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha>,
15 public UnrollOnce,
16 public TryToAvoidTailLoop {
17 public:
18 using VecTraits = neon::VecTraits<uint8_t>;
19 using ScalarType = VecTraits::ScalarType;
20 using VectorType = VecTraits::VectorType;
21 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::de_interleave_indices_;
22 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb;
23 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::v_first_;
24
25 640 explicit YUVpToRGBxOrBGRx(bool is_yv12)
26 640 : YUV420XToRGBxOrBGRx<BGR, kAlpha>(is_yv12) {}
27
28 KLEIDICV_FORCE_INLINE
29 144 void vector_path(VectorType &y0, VectorType &y1, VectorType &y2,
30 VectorType &y3, VectorType &u, VectorType &v,
31 ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
32 // Indices to extract every 4 bytes into 4x 32-bit slots (0xff = ignore)
33 // These are needed to expand each group of 4 bytes into a full 32-bit lane
34 144 uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff,
35 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff};
36
37 144 uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff,
38 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff};
39
40 144 uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff,
41 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff};
42
43 144 uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff,
44 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
45
46 // Expand each 8-bit channel into 32-bit vectors using table lookup and
47 // reinterpret
48 144 int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo));
49 144 int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi));
50 144 int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo));
51 144 int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi));
52
53 144 int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo));
54 144 int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi));
55 144 int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo));
56 144 int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi));
57
58 144 constexpr size_t step = kAlpha ? 4 * 16 : 3 * 16;
59
60 288 yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0,
61 144 rgbx_row_1);
62
63 288 yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi,
64 144 rgbx_row_0 + step, rgbx_row_1 + step);
65 144 }
66
67 // Processes inputs which are not long enough to fit a vector.
68 9936 void scalar_path(size_t length, const ScalarType *y_row_0,
69 const ScalarType *y_row_1, const ScalarType *u_row,
70 const ScalarType *v_row, ScalarType *rgbx_row_0,
71 ScalarType *rgbx_row_1) {
72 9936 const uint8_t *y_rows[2] = {y_row_0, y_row_1};
73 9936 uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1};
74
75 9936 int32_t u_m128 = 0, v_m128 = 0;
76
8/8
✓ Branch 0 taken 2484 times.
✓ Branch 1 taken 8248 times.
✓ Branch 2 taken 2484 times.
✓ Branch 3 taken 8248 times.
✓ Branch 4 taken 2484 times.
✓ Branch 5 taken 8248 times.
✓ Branch 6 taken 2484 times.
✓ Branch 7 taken 8248 times.
42928 for (size_t index = 0; index < length; ++index) {
77 32992 disable_loop_vectorization();
78
79 // There is one {U, V} pair for 4 Y values.
80
8/8
✓ Branch 0 taken 4088 times.
✓ Branch 1 taken 4160 times.
✓ Branch 2 taken 4088 times.
✓ Branch 3 taken 4160 times.
✓ Branch 4 taken 4088 times.
✓ Branch 5 taken 4160 times.
✓ Branch 6 taken 4088 times.
✓ Branch 7 taken 4160 times.
32992 if ((index % 2) == 0) {
81 16640 u_m128 = u_row[0] - 128;
82 16640 v_m128 = v_row[0] - 128;
83 16640 u_row += 1;
84 16640 v_row += 1;
85
8/8
✓ Branch 0 taken 2080 times.
✓ Branch 1 taken 2080 times.
✓ Branch 2 taken 2080 times.
✓ Branch 3 taken 2080 times.
✓ Branch 4 taken 2080 times.
✓ Branch 5 taken 2080 times.
✓ Branch 6 taken 2080 times.
✓ Branch 7 taken 2080 times.
16640 if (v_first_) {
86 8320 std::swap(u_m128, v_m128);
87 8320 }
88 16640 }
89
90 32992 yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows);
91 32992 }
92 9936 }
93 }; // end of class YUVpToRGBxOrBGRx<bool, bool>
94
95 using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>;
96 using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>;
97 using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>;
98 using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>;
99
100 template <typename OperationType, typename ScalarType>
101 640 kleidicv_error_t yuv2rgbx_operation(OperationType &operation,
102 const ScalarType *src, size_t src_stride,
103 ScalarType *dst, size_t dst_stride,
104 size_t width, size_t height, size_t begin,
105 size_t end) {
106
16/16
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 158 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 158 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 158 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 158 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 158 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 158 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 158 times.
640 CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2);
107
16/16
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 156 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 156 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 156 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 156 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 156 times.
632 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
108
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 148 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 152 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 148 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 148 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 152 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 148 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 148 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 152 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 148 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 148 times.
624 CHECK_IMAGE_SIZE(width, height);
109
110 // Pointer to the start of the U plane.
111 // Since `src` points to a planar YUV buffer, the Y plane comes first,
112 // occupying `src_stride * height` bytes.
113 592 const ScalarType *u = src + src_stride * height;
114 // Pointer to the start of the V plane.
115 // The V plane follows the U plane. Both U and V planes are
116 // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and
117 // are often stored in a single contiguous chroma region in memory. Depending
118 // on image height and stride, the starting offset of V may require adjustment
119 // to maintain correct alignment. In particular, when the image height is not
120 // divisible evenly by 4, the chroma rows may not align perfectly, so a
121 // fractional offset (in rows) is applied to calculate the V plane position.
122 // The formula used here accounts for this by adjusting based on row parity,
123 // assuming consistent memory layout across the Y, U, and V planes.
124 1184 const ScalarType *v =
125 592 u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2);
126
127 // These indices control how U and V row strides are selected across the image
128 // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to
129 // two luma (Y) rows. However, when the image height is not divisible by 4,
130 // the mapping between chroma and luma rows becomes asymmetric. Specifically,
131 // when `height % 4 == 2`, the start of the V plane is offset by one chroma
132 // row relative to U.
133 //
134 // This results in U and V rows being interleaved with a phase difference,
135 // which must be accounted for during row-wise traversal. To handle this,
136 // `u_index` and `v_index` are used to alternate the stride selection
137 // independently for U and V across the loop.
138 //
139 // This mechanism ensures that memory access patterns remain correct,
140 // especially in layouts where U and V share a contiguous buffer with
141 // alternating strides. Offsetting `v_index` allows the traversal logic to
142 // maintain correct alignment and prevents misaligned or incorrect reads from
143 // the chroma buffer.
144 592 size_t u_index = 0;
145 592 size_t v_index = height % 4 == 2 ? 1 : 0;
146
147 // Compute the actual row range in the Y plane (full resolution).
148 // Since each UV row maps to 2 Y rows, we double the begin/end indices.
149 592 size_t row_begin = begin * 2;
150 592 size_t row_end = std::min<size_t>(height, end * 2);
151 592 size_t row_uv = begin;
152
153 // UV stepping pattern: first half of row, then padded second half.
154 // Needed to match row strides between chroma and luma components.
155 592 size_t uv_strides[2] = {width / 2, src_stride - width / 2};
156
157 // Calculate starting pointers for Y, U, and V planes at the given stripe
158 // start.
159 592 const ScalarType *y0 = src + row_begin * src_stride;
160 592 u = u + (row_uv / 2) * src_stride;
161 592 v = v + (row_uv / 2) * src_stride;
162
163
8/8
✓ Branch 0 taken 118 times.
✓ Branch 1 taken 30 times.
✓ Branch 2 taken 118 times.
✓ Branch 3 taken 30 times.
✓ Branch 4 taken 118 times.
✓ Branch 5 taken 30 times.
✓ Branch 6 taken 118 times.
✓ Branch 7 taken 30 times.
592 if (row_uv % 2 == 1) {
164 120 u += uv_strides[(u_index++) & 1];
165 120 v += uv_strides[(v_index++) & 1];
166 120 }
167
168 592 size_t dcn = operation.output_channels();
169
8/8
✓ Branch 0 taken 2488 times.
✓ Branch 1 taken 148 times.
✓ Branch 2 taken 2488 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 2488 times.
✓ Branch 5 taken 148 times.
✓ Branch 6 taken 2488 times.
✓ Branch 7 taken 148 times.
10544 for (size_t h = row_begin; h < row_end; h += 2) {
170 9952 ScalarType *row0 = dst + dst_stride * h;
171 9952 ScalarType *row1 = dst + dst_stride * (h + 1);
172 9952 const ScalarType *y1 = y0 + src_stride;
173
174 // Guard for odd-height images.
175 // If the last row in the stripe is unpaired (odd number of rows),
176 // reuse the previous row pointers to avoid out-of-bounds access.
177
8/8
✓ Branch 0 taken 2432 times.
✓ Branch 1 taken 56 times.
✓ Branch 2 taken 2432 times.
✓ Branch 3 taken 56 times.
✓ Branch 4 taken 2432 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 2432 times.
✓ Branch 7 taken 56 times.
9952 if (KLEIDICV_UNLIKELY(h == (row_end - 1))) {
178 224 row1 = row0;
179 224 y1 = y0;
180 224 }
181
182 9952 LoopUnroll2 loop{width, kVectorLength};
183
184 10096 loop.unroll_twice([&](size_t index) {
185 144 uint8x16_t u_vec = vld1q_u8(u + index / 2);
186 144 uint8x16_t v_vec = vld1q_u8(v + index / 2);
187 144 uint8x16_t y0_vec = vld1q_u8(y0 + index);
188 144 uint8x16_t y1_vec = vld1q_u8(y1 + index);
189 144 uint8x16_t y2_vec = vld1q_u8(y0 + index + kVectorLength);
190 144 uint8x16_t y3_vec = vld1q_u8(y1 + index + kVectorLength);
191
192 288 operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec,
193 144 &row0[index * dcn], &row1[index * dcn]);
194 144 });
195
196 19888 loop.remaining([&](size_t index, size_t length) {
197 19872 operation.scalar_path(length - index, y0 + index, y1 + index,
198 9936 u + index / 2, v + index / 2, &row0[index * dcn],
199 9936 &row1[index * dcn]);
200 9936 });
201
202 9952 y0 += src_stride * 2;
203 9952 u += uv_strides[(u_index++) & 1];
204 9952 v += uv_strides[(v_index++) & 1];
205 9952 }
206
207 592 return KLEIDICV_OK;
208 640 }
209
210 KLEIDICV_TARGET_FN_ATTRS
211 673 kleidicv_error_t yuv420p_to_rgb_stripe_u8(
212 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
213 size_t width, size_t height, kleidicv_color_conversion_t color_format,
214 size_t begin, size_t end) {
215
9/9
✓ Branch 0 taken 80 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 80 times.
✓ Branch 3 taken 80 times.
✓ Branch 4 taken 80 times.
✓ Branch 5 taken 80 times.
✓ Branch 6 taken 80 times.
✓ Branch 7 taken 33 times.
✓ Branch 8 taken 80 times.
673 switch (color_format) {
216 case KLEIDICV_YV12_TO_BGR: {
217 80 YUVpToBGR operation{true};
218 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
219 80 width, height, begin, end);
220 80 }
221
222 case KLEIDICV_YV12_TO_RGB: {
223 80 YUVpToRGB operation{true};
224 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
225 80 width, height, begin, end);
226 80 }
227
228 case KLEIDICV_YV12_TO_BGRA: {
229 80 YUVpToBGRA operation{true};
230 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
231 80 width, height, begin, end);
232 80 }
233
234 case KLEIDICV_YV12_TO_RGBA: {
235 80 YUVpToRGBA operation{true};
236 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
237 80 width, height, begin, end);
238 80 }
239
240 case KLEIDICV_IYUV_TO_BGR: {
241 80 YUVpToBGR operation{false};
242 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
243 80 width, height, begin, end);
244 80 }
245
246 case KLEIDICV_IYUV_TO_RGB: {
247 80 YUVpToRGB operation{false};
248 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
249 80 width, height, begin, end);
250 80 }
251
252 case KLEIDICV_IYUV_TO_BGRA: {
253 80 YUVpToBGRA operation{false};
254 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
255 80 width, height, begin, end);
256 80 }
257
258 case KLEIDICV_IYUV_TO_RGBA: {
259 80 YUVpToRGBA operation{false};
260 160 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride,
261 80 width, height, begin, end);
262 80 }
263
264 default:
265 33 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
266 }
267
268 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
269 673 }
270
271 } // namespace kleidicv::neon
272