KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 106 106 100.0%
Functions: 28 28 100.0%
Branches: 96 96 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <utility>
6
7 #include "kleidicv/conversions/yuv_420_to_rgb.h"
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10 #include "yuv420_to_rgb_neon.h"
11
12 namespace kleidicv::neon {
13 template <bool BGR, bool kAlpha>
14 class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha>,
15 public UnrollOnce,
16 public TryToAvoidTailLoop {
17 public:
18 using VecTraits = neon::VecTraits<uint8_t>;
19 using ScalarType = VecTraits::ScalarType;
20 using VectorType = VecTraits::VectorType;
21 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::de_interleave_indices_;
22 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb;
23 using YUV420XToRGBxOrBGRx<BGR, kAlpha>::v_first_;
24
25 380 explicit YUVpToRGBxOrBGRx(bool is_yv12)
26 380 : YUV420XToRGBxOrBGRx<BGR, kAlpha>(is_yv12) {}
27
28 288 void vector_path(VectorType &y0, VectorType &y1, VectorType &y2,
29 VectorType &y3, VectorType &u, VectorType &v,
30 ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
31 // Indices that move each of 4 consecutive bytes into its own 32-bit lane.
32 // 0xff is out of range for the table lookup, so those lane bytes become zero.
33 288 uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff,
34 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff};
35
36 288 uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff,
37 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff};
38
39 288 uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff,
40 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff};
41
42 288 uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff,
43 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
44
45 // Expand each 8-bit channel into 32-bit vectors using table lookup and
46 // reinterpret
47 288 int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo));
48 288 int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi));
49 288 int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo));
50 288 int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi));
51
52 288 int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo));
53 288 int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi));
54 288 int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo));
55 288 int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi));
56
57 288 constexpr size_t step = kAlpha ? 4 * 16 : 3 * 16;
58
59 576 yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0,
60 288 rgbx_row_1);
61
62 576 yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi,
63 288 rgbx_row_0 + step, rgbx_row_1 + step);
64 288 }
65
66 // Processes inputs which are not long enough to fit a vector.
67 5184 void scalar_path(size_t length, const ScalarType *y_row_0,
68 const ScalarType *y_row_1, const ScalarType *u_row,
69 const ScalarType *v_row, ScalarType *rgbx_row_0,
70 ScalarType *rgbx_row_1) {
71 5184 const uint8_t *y_rows[2] = {y_row_0, y_row_1};
72 5184 uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1};
73
74 5184 int32_t u_m128 = 0, v_m128 = 0;
75
8/8
✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 4646 times.
✓ Branch 2 taken 1296 times.
✓ Branch 3 taken 4646 times.
✓ Branch 4 taken 1296 times.
✓ Branch 5 taken 4646 times.
✓ Branch 6 taken 1296 times.
✓ Branch 7 taken 4646 times.
23768 for (size_t index = 0; index < length; ++index) {
76 18584 disable_loop_vectorization();
77
78 // There is one {U, V} pair for 4 Y values.
79
8/8
✓ Branch 0 taken 2314 times.
✓ Branch 1 taken 2332 times.
✓ Branch 2 taken 2314 times.
✓ Branch 3 taken 2332 times.
✓ Branch 4 taken 2314 times.
✓ Branch 5 taken 2332 times.
✓ Branch 6 taken 2314 times.
✓ Branch 7 taken 2332 times.
18584 if ((index % 2) == 0) {
80 9328 u_m128 = u_row[0] - 128;
81 9328 v_m128 = v_row[0] - 128;
82 9328 u_row += 1;
83 9328 v_row += 1;
84
8/8
✓ Branch 0 taken 2062 times.
✓ Branch 1 taken 270 times.
✓ Branch 2 taken 2062 times.
✓ Branch 3 taken 270 times.
✓ Branch 4 taken 2062 times.
✓ Branch 5 taken 270 times.
✓ Branch 6 taken 2062 times.
✓ Branch 7 taken 270 times.
9328 if (v_first_) {
85 1080 std::swap(u_m128, v_m128);
86 1080 }
87 9328 }
88
89 18584 yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows);
90 18584 }
91 5184 }
92 }; // end of class YUVpToRGBxOrBGRx<bool, bool>
93
94 using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>;
95 using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>;
96 using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>;
97 using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>;
98
99 template <typename OperationType, typename ScalarType>
100 380 kleidicv_error_t yuv2rgbx_operation(OperationType &operation,
101 const ScalarType *src, size_t src_stride,
102 ScalarType *dst, size_t dst_stride,
103 size_t width, size_t height, size_t begin,
104 size_t end) {
105
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 91 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 91 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 91 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 91 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 91 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 91 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 91 times.
380 CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2);
106
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 87 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 87 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 87 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 87 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 87 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 87 times.
364 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
107
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 83 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 83 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 79 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 79 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 83 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 79 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 79 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 83 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 79 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 79 times.
348 CHECK_IMAGE_SIZE(width, height);
108
109 // Pointer to the start of the U plane.
110 // Since `src` points to a planar YUV buffer, the Y plane comes first,
111 // occupying `src_stride * height` bytes.
112 316 const ScalarType *u = src + src_stride * height;
113 // Pointer to the start of the V plane.
114 // The V plane follows the U plane. Both U and V planes are
115 // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and
116 // are stored in a single contiguous chroma region in which two chroma rows of
117 // `width / 2` bytes share one `src_stride`-sized row. The U plane therefore
118 // covers `height / 4` full rows of the buffer, plus one more chroma row of
119 // `width / 2` bytes when `height % 4` is 2 or 3. In that case the V plane
120 // does not start on a row boundary but in the middle of a row, directly
121 // after the last U row. The formula used here accounts for this, assuming a
122 // consistent memory layout across the Y, U, and V planes.
123 632 const ScalarType *v =
124 316 u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2);
125
126 // These indices control how U and V row strides are selected across the image
127 // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to
128 // two luma (Y) rows. However, when the image height is not divisible by 4,
129 // the mapping between chroma and luma rows becomes asymmetric. Specifically,
130 // when `height % 4 == 2`, the start of the V plane is offset by one chroma
131 // row relative to U.
132 //
133 // This results in U and V rows being interleaved with a phase difference,
134 // which must be accounted for during row-wise traversal. To handle this,
135 // `u_index` and `v_index` are used to alternate the stride selection
136 // independently for U and V across the loop.
137 //
138 // This mechanism ensures that memory access patterns remain correct,
139 // especially in layouts where U and V share a contiguous buffer with
140 // alternating strides. Offsetting `v_index` allows the traversal logic to
141 // maintain correct alignment and prevents misaligned or incorrect reads from
142 // the chroma buffer.
143 316 size_t u_index = 0;
144 316 size_t v_index = height % 4 == 2 ? 1 : 0;
145
146 // Compute the actual row range in the Y plane (full resolution).
147 // Since each UV row maps to 2 Y rows, we double the begin/end indices.
148 316 size_t row_begin = begin * 2;
149 316 size_t row_end = std::min<size_t>(height, end * 2);
150 316 size_t row_uv = begin;
151
152 // UV stepping pattern: first half of row, then padded second half.
153 // Needed to match row strides between chroma and luma components.
154 316 size_t uv_strides[2] = {width / 2, src_stride - width / 2};
155
156 // Calculate starting pointers for Y, U, and V planes at the given stripe
157 // start.
158 316 const ScalarType *y0 = src + row_begin * src_stride;
159 316 u = u + row_uv * src_stride / 2;
160 316 v = v + row_uv * src_stride / 2;
161
162 316 size_t dcn = operation.output_channels();
163
8/8
✓ Branch 0 taken 1336 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1336 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 1336 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 1336 times.
✓ Branch 7 taken 79 times.
5660 for (size_t h = row_begin; h < row_end; h += 2) {
164 5344 ScalarType *row0 = dst + dst_stride * h;
165 5344 ScalarType *row1 = dst + dst_stride * (h + 1);
166 5344 const ScalarType *y1 = y0 + src_stride;
167
168 // Guard for odd-height images.
169 // If the last row in the stripe is unpaired (odd number of rows),
170 // reuse the previous row pointers to avoid out-of-bounds access.
171
8/8
✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 40 times.
✓ Branch 2 taken 1296 times.
✓ Branch 3 taken 40 times.
✓ Branch 4 taken 1296 times.
✓ Branch 5 taken 40 times.
✓ Branch 6 taken 1296 times.
✓ Branch 7 taken 40 times.
5344 if (KLEIDICV_UNLIKELY(h == (row_end - 1))) {
172 160 row1 = row0;
173 160 y1 = y0;
174 160 }
175
176 5344 LoopUnroll2 loop{width, kVectorLength};
177
178 5632 loop.unroll_twice([&](size_t index) {
179 288 uint8x16_t u_vec = vld1q_u8(u + index / 2);
180 288 uint8x16_t v_vec = vld1q_u8(v + index / 2);
181 288 uint8x16_t y0_vec = vld1q_u8(y0 + index);
182 288 uint8x16_t y1_vec = vld1q_u8(y1 + index);
183 288 uint8x16_t y2_vec = vld1q_u8(y0 + index + kVectorLength);
184 288 uint8x16_t y3_vec = vld1q_u8(y1 + index + kVectorLength);
185
186 576 operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec,
187 288 &row0[index * dcn], &row1[index * dcn]);
188 288 });
189
190 10528 loop.remaining([&](size_t index, size_t length) {
191 10368 operation.scalar_path(length - index, y0 + index, y1 + index,
192 5184 u + index / 2, v + index / 2, &row0[index * dcn],
193 5184 &row1[index * dcn]);
194 5184 });
195
196 5344 y0 += src_stride * 2;
197 5344 u += uv_strides[(u_index++) & 1];
198 5344 v += uv_strides[(v_index++) & 1];
199 5344 }
200
201 316 return KLEIDICV_OK;
202 380 }
203
204 KLEIDICV_TARGET_FN_ATTRS
205 95 kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride,
206 uint8_t *dst, size_t dst_stride,
207 size_t width, size_t height,
208 bool is_yv12, size_t begin,
209 size_t end) {
210 95 YUVpToRGB operation{is_yv12};
211 285 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
212 95 height, begin, end);
213 95 }
214
215 KLEIDICV_TARGET_FN_ATTRS
216 95 kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride,
217 uint8_t *dst, size_t dst_stride,
218 size_t width, size_t height,
219 bool is_yv12, size_t begin,
220 size_t end) {
221 95 YUVpToRGBA operation{is_yv12};
222 285 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
223 95 height, begin, end);
224 95 }
225
226 KLEIDICV_TARGET_FN_ATTRS
227 95 kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride,
228 uint8_t *dst, size_t dst_stride,
229 size_t width, size_t height,
230 bool is_yv12, size_t begin,
231 size_t end) {
232 95 YUVpToBGR operation{is_yv12};
233 285 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
234 95 height, begin, end);
235 95 }
236
237 KLEIDICV_TARGET_FN_ATTRS
238 95 kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride,
239 uint8_t *dst, size_t dst_stride,
240 size_t width, size_t height,
241 bool is_yv12, size_t begin,
242 size_t end) {
243 95 YUVpToBGRA operation{is_yv12};
244 285 return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
245 95 height, begin, end);
246 95 }
247 } // namespace kleidicv::neon
248