KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv422_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 173 173 100.0%
Functions: 109 109 100.0%
Branches: 127 127 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cstddef>
6 #include <utility>
7
8 #include "kleidicv/conversions/rgb_to_yuv.h"
9 #include "kleidicv/kleidicv.h"
10 #include "kleidicv/neon.h"
11
12 namespace kleidicv::neon {
13
14 static const int kWeightScale = 14;
15
16 static const int KR2Y422Weight = 4211; // 0.299077 * (236 - 16) / 256 * 16384
17 static const int KG2Y422Weight = 8258; // 0.586506 * (236 - 16) / 256 * 16384
18 static const int KB2Y422Weight = 1606; // 0.114062 * (236 - 16) / 256 * 16384
19
20 static const int KR2U422Weight = -1212; // -0.148 * 8192
21 static const int KG2U422Weight = -2384; // -0.291 * 8192
22 static const int KB2U422Weight = 3596; // 0.439 * 8192
23 static const int KG2V422Weight = -3015; // -0.368 * 8192
24 static const int KB2V422Weight = -582; // -0.071 * 8192
25
26 template <size_t b_idx, size_t u_idx, size_t y_idx, size_t scn>
27 class RGBxOrBGRxToYUV422 {
28 public:
29 static constexpr size_t r_idx = 2 - b_idx;
30 static constexpr size_t v_idx = (u_idx + 2) % 4;
31
32 168 static kleidicv_error_t rgbx2yuv422_operation(const uint8_t* src,
33 size_t src_stride, uint8_t* dst,
34 size_t dst_stride, size_t width,
35 size_t height) {
36 // YUV422 stores two bytes per pixel on average: one luma (Y) byte per
37 // pixel plus one chroma byte (U or V) shared by each pair of pixels, so
38 // the destination channel count (dcn) is 2.
39 168 constexpr size_t dcn = 2;
40
41 // Loop through rows along the image height.
42
24/24
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 53 times.
✓ Branch 7 taken 2296 times.
✓ Branch 8 taken 53 times.
✓ Branch 9 taken 2296 times.
✓ Branch 10 taken 53 times.
✓ Branch 11 taken 2296 times.
✓ Branch 12 taken 1 times.
✓ Branch 13 taken 8 times.
✓ Branch 14 taken 1 times.
✓ Branch 15 taken 8 times.
✓ Branch 16 taken 1 times.
✓ Branch 17 taken 8 times.
✓ Branch 18 taken 1 times.
✓ Branch 19 taken 8 times.
✓ Branch 20 taken 1 times.
✓ Branch 21 taken 8 times.
✓ Branch 22 taken 1 times.
✓ Branch 23 taken 8 times.
7128 for (size_t h = 0; h < height; h++, src += src_stride) {
43 // Keep track of the current output row being written.
44 6960 Columns<uint8_t> dst_row{dst + dst_stride * h, dcn};
45 6960 LoopUnroll2 loop{width, kVectorLength};
46
47 // Vector path: each call loads two consecutive groups of kVectorLength
48 // pixels. Adjacent pixels form a pair that gets its own Y0 and Y1 and one
49 // shared U and V, stored in the byte order selected by y_idx, u_idx and
50 // v_idx, so the output layout is not fixed to YVYU.
51 7344 loop.unroll_twice([&](size_t index) {
52 384 uint8x16_t r0, g0, b0, r1, g1, b1;
53 if constexpr (scn == 4) {
54 192 uint8x16x4_t rgbx_0 = vld4q_u8(src + index * scn);
55 192 uint8x16x4_t rgbx_1 =
56 192 vld4q_u8(src + index * scn + kVectorLength * scn);
57
58 192 r0 = rgbx_0.val[r_idx], g0 = rgbx_0.val[1], b0 = rgbx_0.val[b_idx];
59 192 r1 = rgbx_1.val[r_idx], g1 = rgbx_1.val[1], b1 = rgbx_1.val[b_idx];
60 192 } else {
61 192 uint8x16x3_t rgbx_0 = vld3q_u8(src + index * scn);
62 192 uint8x16x3_t rgbx_1 =
63 192 vld3q_u8(src + index * scn + kVectorLength * scn);
64
65 192 r0 = rgbx_0.val[r_idx], g0 = rgbx_0.val[1], b0 = rgbx_0.val[b_idx];
66 192 r1 = rgbx_1.val[r_idx], g1 = rgbx_1.val[1], b1 = rgbx_1.val[b_idx];
67 192 }
68
69 768 rgb_to_yuv422(r0, g0, b0, r1, g1, b1,
70 384 dst_row.ptr_at(static_cast<ptrdiff_t>(index)));
71 384 });
72
73 13920 loop.remaining([&](size_t index, size_t length) {
74
24/24
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 3550 times.
✓ Branch 7 taken 2296 times.
✓ Branch 8 taken 3550 times.
✓ Branch 9 taken 2296 times.
✓ Branch 10 taken 3550 times.
✓ Branch 11 taken 2296 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 8 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 8 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 8 times.
✓ Branch 18 taken 8 times.
✓ Branch 19 taken 8 times.
✓ Branch 20 taken 8 times.
✓ Branch 21 taken 8 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 8 times.
17682 for (; index < length; index += 2) {
75 21444 const uint8_t r1 = src[index * scn + r_idx],
76 10722 g1 = src[index * scn + 1],
77 10722 b1 = src[index * scn + b_idx];
78
79 21444 const uint8_t r2 = src[index * scn + scn + r_idx],
80 10722 g2 = src[index * scn + scn + 1],
81 10722 b2 = src[index * scn + scn + b_idx];
82
83 21444 rgb_to_yuv422(r1, g1, b1, r2, g2, b2,
84 10722 dst_row.ptr_at(static_cast<ptrdiff_t>(index)));
85 10722 }
86 6960 });
87 6960 }
88 168 return KLEIDICV_OK;
89 168 }
90
91 private:
92 21444 static uint8_t compute_y(const uint8_t r, const uint8_t g, const uint8_t b) {
93 21444 int y_ = r * KR2Y422Weight + g * KG2Y422Weight + b * KB2Y422Weight +
94 (1 << kWeightScale) * 16;
95 42888 return saturating_cast<int, uint8_t>(((1 << (kWeightScale - 1)) + y_) >>
96 kWeightScale);
97 21444 }
98
99 10722 static std::pair<uint8_t, uint8_t> compute_uv(
100 const uint8_t r1, const uint8_t g1, const uint8_t b1, const uint8_t r2,
101 const uint8_t g2, const uint8_t b2) {
102 10722 int sr = r1 + r2, sg = g1 + g2, sb = b1 + b2;
103
104 10722 int u_ = sr * KR2U422Weight + sg * KG2U422Weight + sb * KB2U422Weight +
105 (1 << (kWeightScale - 1)) * 256;
106 21444 uint8_t u = saturating_cast<int, uint8_t>(
107 10722 ((1 << (kWeightScale - 1)) + u_) >> kWeightScale);
108
109 10722 int v_ = sr * KB2U422Weight + sg * KG2V422Weight + sb * KB2V422Weight +
110 (1 << (kWeightScale - 1)) * 256;
111 21444 uint8_t v = saturating_cast<int, uint8_t>(
112 10722 ((1 << (kWeightScale - 1)) + v_) >> kWeightScale);
113 10722 return {u, v};
114 10722 }
115
116 KLEIDICV_FORCE_INLINE
117 10722 static void rgb_to_yuv422(const uint8_t r0, const uint8_t g0,
118 const uint8_t b0, const uint8_t r1,
119 const uint8_t g1, const uint8_t b1,
120 uint8_t* dst_ptr) {
121 10722 auto y0 = compute_y(r0, g0, b0);
122 10722 auto y1 = compute_y(r1, g1, b1);
123 10722 auto [u, v] = compute_uv(r0, g0, b0, r1, g1, b1);
124
125 10722 dst_ptr[y_idx] = y0;
126 10722 dst_ptr[y_idx + 2] = y1;
127 10722 dst_ptr[u_idx] = u;
128 10722 dst_ptr[v_idx] = v;
129 10722 }
130
131 384 static inline void rgb_to_yuv422(const uint8x16_t r0, const uint8x16_t g0,
132 const uint8x16_t b0, const uint8x16_t r1,
133 const uint8x16_t g1, const uint8x16_t b1,
134 uint8_t* dst_ptr) {
135 384 int y_base = (1 << (kWeightScale - 1)) + (1 << kWeightScale) * 16;
136 384 int uv_bias = (1 << (kWeightScale - 1)) + (1 << (kWeightScale - 1)) * 256;
137
138 384 int16x8_t r_even[2];
139 384 int16x8_t g_even[2];
140 384 int16x8_t b_even[2];
141
142 384 const uint8x16_t index_even = {0, 0xff, 2, 0xff, 4, 0xff, 6, 0xff,
143 8, 0xff, 10, 0xff, 12, 0xff, 14, 0xff};
144
145 384 const uint8x16_t index_odd = {1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff,
146 9, 0xff, 11, 0xff, 13, 0xff, 15, 0xff};
147
148 384 r_even[0] = vreinterpretq_s16_u8(vqtbl1q_u8(r0, index_even));
149 384 r_even[1] = vreinterpretq_s16_u8(vqtbl1q_u8(r1, index_even));
150
151 384 g_even[0] = vreinterpretq_s16_u8(vqtbl1q_u8(g0, index_even));
152 384 g_even[1] = vreinterpretq_s16_u8(vqtbl1q_u8(g1, index_even));
153
154 384 b_even[0] = vreinterpretq_s16_u8(vqtbl1q_u8(b0, index_even));
155 384 b_even[1] = vreinterpretq_s16_u8(vqtbl1q_u8(b1, index_even));
156
157 768 uint8x16_t y1 =
158 768 compute_weighted_channel_422(r_even, g_even, b_even, KR2Y422Weight,
159 384 KG2Y422Weight, KB2Y422Weight, y_base);
160
161 384 int16x8_t r_odd[2];
162 384 int16x8_t g_odd[2];
163 384 int16x8_t b_odd[2];
164
165 384 r_odd[0] = vreinterpretq_s16_u8(vqtbl1q_u8(r0, index_odd));
166 384 r_odd[1] = vreinterpretq_s16_u8(vqtbl1q_u8(r1, index_odd));
167
168 384 g_odd[0] = vreinterpretq_s16_u8(vqtbl1q_u8(g0, index_odd));
169 384 g_odd[1] = vreinterpretq_s16_u8(vqtbl1q_u8(g1, index_odd));
170
171 384 b_odd[0] = vreinterpretq_s16_u8(vqtbl1q_u8(b0, index_odd));
172 384 b_odd[1] = vreinterpretq_s16_u8(vqtbl1q_u8(b1, index_odd));
173
174 768 uint8x16_t y2 =
175 768 compute_weighted_channel_422(r_odd, g_odd, b_odd, KR2Y422Weight,
176 384 KG2Y422Weight, KB2Y422Weight, y_base);
177
178 384 int16x8_t r_avg[2];
179 384 int16x8_t g_avg[2];
180 384 int16x8_t b_avg[2];
181
182 384 r_avg[0] = vaddq_s16(r_even[0], r_odd[0]);
183 384 r_avg[1] = vaddq_s16(r_even[1], r_odd[1]);
184 384 g_avg[0] = vaddq_s16(g_even[0], g_odd[0]);
185 384 g_avg[1] = vaddq_s16(g_even[1], g_odd[1]);
186 384 b_avg[0] = vaddq_s16(b_even[0], b_odd[0]);
187 384 b_avg[1] = vaddq_s16(b_even[1], b_odd[1]);
188
189 768 uint8x16_t u =
190 768 compute_weighted_channel_422(r_avg, g_avg, b_avg, KR2U422Weight,
191 384 KG2U422Weight, KB2U422Weight, uv_bias);
192 768 uint8x16_t v =
193 768 compute_weighted_channel_422(r_avg, g_avg, b_avg, KB2U422Weight,
194 384 KG2V422Weight, KB2V422Weight, uv_bias);
195
196 384 uint8x16x4_t yuv422;
197
198 384 yuv422.val[u_idx] = u;
199 384 yuv422.val[v_idx] = v;
200 384 yuv422.val[y_idx] = y1, yuv422.val[y_idx + 2] = y2;
201
202 384 vst4q_u8(dst_ptr, yuv422);
203 384 }
204
205 1536 static inline uint8x16_t compute_weighted_channel_422(
206 int16x8_t r[2], int16x8_t g[2], int16x8_t b[2], int16_t r_coeff,
207 int16_t g_coeff, int16_t b_coeff, int fixed) {
208 1536 int32x4_t bias = vdupq_n_s32(fixed);
209 1536 int32x4_t acc_lo[2] = {bias, bias};
210 1536 int32x4_t acc_hi[2] = {bias, bias};
211
212 KLEIDICV_FORCE_LOOP_UNROLL
213
24/24
✓ Branch 0 taken 256 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 256 times.
✓ Branch 3 taken 128 times.
✓ Branch 4 taken 256 times.
✓ Branch 5 taken 128 times.
✓ Branch 6 taken 256 times.
✓ Branch 7 taken 128 times.
✓ Branch 8 taken 256 times.
✓ Branch 9 taken 128 times.
✓ Branch 10 taken 256 times.
✓ Branch 11 taken 128 times.
✓ Branch 12 taken 256 times.
✓ Branch 13 taken 128 times.
✓ Branch 14 taken 256 times.
✓ Branch 15 taken 128 times.
✓ Branch 16 taken 256 times.
✓ Branch 17 taken 128 times.
✓ Branch 18 taken 256 times.
✓ Branch 19 taken 128 times.
✓ Branch 20 taken 256 times.
✓ Branch 21 taken 128 times.
✓ Branch 22 taken 256 times.
✓ Branch 23 taken 128 times.
4608 for (int i = 0; i < 2; i++) {
214 // R contributions
215 3072 acc_lo[i] = vmlal_n_s16(bias, vget_low(r[i]), r_coeff);
216 // G contributions
217 3072 acc_lo[i] = vmlal_n_s16(acc_lo[i], vget_low(g[i]), g_coeff);
218 // B contributions
219 3072 int32x4_t tmp = vmull_n_s16(vget_low(b[i]), b_coeff);
220 3072 acc_lo[i] = vaddq(acc_lo[i], tmp);
221 3072 }
222
223 KLEIDICV_FORCE_LOOP_UNROLL
224
24/24
✓ Branch 0 taken 128 times.
✓ Branch 1 taken 256 times.
✓ Branch 2 taken 128 times.
✓ Branch 3 taken 256 times.
✓ Branch 4 taken 128 times.
✓ Branch 5 taken 256 times.
✓ Branch 6 taken 128 times.
✓ Branch 7 taken 256 times.
✓ Branch 8 taken 128 times.
✓ Branch 9 taken 256 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 256 times.
✓ Branch 12 taken 128 times.
✓ Branch 13 taken 256 times.
✓ Branch 14 taken 128 times.
✓ Branch 15 taken 256 times.
✓ Branch 16 taken 128 times.
✓ Branch 17 taken 256 times.
✓ Branch 18 taken 128 times.
✓ Branch 19 taken 256 times.
✓ Branch 20 taken 128 times.
✓ Branch 21 taken 256 times.
✓ Branch 22 taken 128 times.
✓ Branch 23 taken 256 times.
4608 for (int i = 0; i < 2; i++) {
225 // R contributions
226 3072 acc_hi[i] = vmlal_high_n_s16(bias, r[i], r_coeff);
227 // G contributions
228 3072 acc_hi[i] = vmlal_high_n_s16(acc_hi[i], g[i], g_coeff);
229 // B contributions
230 3072 int32x4_t tmp = vmull_high_n_s16(b[i], b_coeff);
231 3072 acc_hi[i] = vaddq(acc_hi[i], tmp);
232 3072 }
233
234 3072 return normalize_and_pack(acc_lo[0], acc_hi[0], acc_lo[1], acc_hi[1]);
235 1536 }
236
237 KLEIDICV_FORCE_INLINE
238 1536 static uint8x16_t normalize_and_pack(int32x4_t vec_0, int32x4_t vec_1,
239 int32x4_t vec_2, int32x4_t vec_3) {
240 1536 uint8x16_t index = {0, 2, 4, 6, 8, 10, 12, 14,
241 16, 18, 20, 22, 24, 26, 28, 30};
242 1536 int16x4_t tmp_lo_lo = vqshrun_n_s32(vec_0, kWeightScale);
243 1536 int16x8_t tmp_lo_hi = vqshrun_high_n_s32(tmp_lo_lo, vec_1, kWeightScale);
244 1536 int16x4_t tmp_hi_lo = vqshrun_n_s32(vec_2, kWeightScale);
245 1536 int16x8_t tmp_hi_hi = vqshrun_high_n_s32(tmp_hi_lo, vec_3, kWeightScale);
246
247 1536 uint8x16x2_t tmp;
248 1536 tmp.val[0] = vreinterpretq_u8(tmp_lo_hi); // 0, 1, 2, 3, 4, 5, 6, 7
249 1536 tmp.val[1] = vreinterpretq_u8(tmp_hi_hi); // 8, 9, 10, 11, 12, 13, 14, 15
250
251 1536 uint8x16_t output = vqtbl2q_u8(tmp, index);
252
253 3072 return output;
254 1536 }
255 };
256
257 KLEIDICV_TARGET_FN_ATTRS
258 196 kleidicv_error_t rgb_to_yuv422_u8(const uint8_t* src, size_t src_stride,
259 uint8_t* dst, size_t dst_stride, size_t width,
260 size_t height,
261 kleidicv_color_conversion_t color_format) {
262
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 195 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 195 times.
196 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
263
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 194 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 194 times.
195 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
264
6/6
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 187 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 183 times.
✓ Branch 4 taken 11 times.
✓ Branch 5 taken 183 times.
194 CHECK_IMAGE_SIZE(width, height);
265
266 // YUV422 packs two pixels into each 4-byte group, e.g. (Y0, U, Y1, V) for YUYV.
267 // The width must therefore be even and at least 2; other widths are rejected as not implemented.
268
4/4
✓ Branch 0 taken 179 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 175 times.
183 if (width < 2 || (width % 2) != 0) {
269 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
270 }
271
272
13/13
✓ Branch 0 taken 53 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 1 times.
✓ Branch 6 taken 53 times.
✓ Branch 7 taken 53 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 1 times.
✓ Branch 10 taken 1 times.
✓ Branch 11 taken 1 times.
✓ Branch 12 taken 1 times.
175 switch (color_format) {
273 case KLEIDICV_BGR_TO_YUYV:
274 1 return RGBxOrBGRxToYUV422<0, 1, 0, 3>::rgbx2yuv422_operation(
275 1 src, src_stride, dst, dst_stride, width, height);
276 break;
277 case KLEIDICV_BGR_TO_UYVY:
278 1 return RGBxOrBGRxToYUV422<0, 0, 1, 3>::rgbx2yuv422_operation(
279 1 src, src_stride, dst, dst_stride, width, height);
280 break;
281 case KLEIDICV_BGR_TO_YVYU:
282 1 return RGBxOrBGRxToYUV422<0, 3, 0, 3>::rgbx2yuv422_operation(
283 1 src, src_stride, dst, dst_stride, width, height);
284 break;
285 case KLEIDICV_RGB_TO_YUYV:
286 53 return RGBxOrBGRxToYUV422<2, 1, 0, 3>::rgbx2yuv422_operation(
287 53 src, src_stride, dst, dst_stride, width, height);
288 break;
289 case KLEIDICV_RGB_TO_UYVY:
290 53 return RGBxOrBGRxToYUV422<2, 0, 1, 3>::rgbx2yuv422_operation(
291 53 src, src_stride, dst, dst_stride, width, height);
292 break;
293 case KLEIDICV_RGB_TO_YVYU:
294 53 return RGBxOrBGRxToYUV422<2, 3, 0, 3>::rgbx2yuv422_operation(
295 53 src, src_stride, dst, dst_stride, width, height);
296 break;
297 case KLEIDICV_BGRA_TO_YUYV:
298 1 return RGBxOrBGRxToYUV422<0, 1, 0, 4>::rgbx2yuv422_operation(
299 1 src, src_stride, dst, dst_stride, width, height);
300 break;
301 case KLEIDICV_BGRA_TO_UYVY:
302 1 return RGBxOrBGRxToYUV422<0, 0, 1, 4>::rgbx2yuv422_operation(
303 1 src, src_stride, dst, dst_stride, width, height);
304 break;
305 case KLEIDICV_BGRA_TO_YVYU:
306 1 return RGBxOrBGRxToYUV422<0, 3, 0, 4>::rgbx2yuv422_operation(
307 1 src, src_stride, dst, dst_stride, width, height);
308 break;
309 case KLEIDICV_RGBA_TO_YUYV:
310 1 return RGBxOrBGRxToYUV422<2, 1, 0, 4>::rgbx2yuv422_operation(
311 1 src, src_stride, dst, dst_stride, width, height);
312 break;
313 case KLEIDICV_RGBA_TO_UYVY:
314 1 return RGBxOrBGRxToYUV422<2, 0, 1, 4>::rgbx2yuv422_operation(
315 1 src, src_stride, dst, dst_stride, width, height);
316 break;
317 case KLEIDICV_RGBA_TO_YVYU:
318 1 return RGBxOrBGRxToYUV422<2, 3, 0, 4>::rgbx2yuv422_operation(
319 1 src, src_stride, dst, dst_stride, width, height);
320 break;
321 default:
322 7 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
323 break;
324 }
325
326 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
327 196 }
328
329 } // namespace kleidicv::neon
330