KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv444_sc.h
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 170 170 100.0%
Functions: 75 75 100.0%
Branches: 65 65 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RGB_TO_YUV444_SC_H
6 #define KLEIDICV_RGB_TO_YUV444_SC_H
7
8 #include <limits>
9 #include <memory>
10
11 #include "kleidicv/conversions/rgb_to_yuv.h"
12 #include "kleidicv/kleidicv.h"
13 #include "kleidicv/sve2.h"
14 #include "rgb_to_yuv444_coefficients.h"
15
16 namespace KLEIDICV_TARGET_NAMESPACE {
17
// Shared arithmetic for the RGB(A)/BGR(A) -> YUV444 kernels in this file.
// Derived classes load and widen the colour channels; this base converts them
// to 8-bit Y, U, V and stores the result interleaved.
// BGR selects the channel order of the source pixels (see r_index_/b_index_).
18 template <bool BGR>
19 class RGBToYUVBase : public UnrollOnce {
20 public:
21 using ContextType = Context;
22 using ScalarType = uint8_t;
23 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
24
25 protected:
// Converts widened R/G/B values to Y/U/V and stores them interleaved at dst.
//   pg       - predicate applied to the final svst3 store.
//   x_0, x_1 - 16-bit channel values; x_0 ends up in the even output bytes
//              (svqxtunb) and x_1 in the odd output bytes (svqxtunt).
//   dst      - destination of the interleaved Y, U, V bytes.
// Local naming: in y_AB / u_AB / v_AB, A is 0 for the svmullb/svmlalb lanes
// and 1 for the svmullt/svmlalt lanes; B is the input vector (x_0 or x_1).
// The k*Weight constants and kWeightScale are defined outside this listing.
26 KLEIDICV_FORCE_INLINE
27 3920 void vector_calculation_path(svbool_t pg, svint16_t r_0, svint16_t r_1,
28 svint16_t g_0, svint16_t g_1, svint16_t b_0,
29 svint16_t b_1,
30 ScalarType *dst) KLEIDICV_STREAMING {
31 // Compute Y value in 32-bit precision:
// Y = r * kRYWeight + g * kGYWeight + b * kBYWeight, then scaled back to
// 16 bits by combine_scaled_s16().
32 3920 svint16_t y_0, y_1;
33 {
34 3920 svint32_t y_00 = svmullb(r_0, kRYWeight);
35 3920 svint32_t y_01 = svmullb(r_1, kRYWeight);
36 3920 svint32_t y_10 = svmullt(r_0, kRYWeight);
37 3920 svint32_t y_11 = svmullt(r_1, kRYWeight);
38
39 3920 y_00 = svmlalb(y_00, g_0, kGYWeight);
40 3920 y_01 = svmlalb(y_01, g_1, kGYWeight);
41 3920 y_10 = svmlalt(y_10, g_0, kGYWeight);
42 3920 y_11 = svmlalt(y_11, g_1, kGYWeight);
43
44 3920 y_00 = svmlalb(y_00, b_0, kBYWeight);
45 3920 y_01 = svmlalb(y_01, b_1, kBYWeight);
46 3920 y_10 = svmlalt(y_10, b_0, kBYWeight);
47 3920 y_11 = svmlalt(y_11, b_1, kBYWeight);
48
49 3920 y_0 = combine_scaled_s16(y_00, y_10);
50 3920 y_1 = combine_scaled_s16(y_01, y_11);
51 3920 }
52
53 // Using the 16-bit Y value, calculate U:
// U = (half_ + (b - Y) * kBUWeight) scaled by combine_scaled_s16().
54 3920 svint16_t u_0, u_1;
55 {
56 3920 svint16_t uy_0 = svsub_x(VecTraits::svptrue(), b_0, y_0);
57 3920 svint16_t uy_1 = svsub_x(VecTraits::svptrue(), b_1, y_1);
58
// Accumulators start at half_ so the offset is added before scaling.
59 3920 svint32_t u_00 = svdup_n_s32(half_);
60 3920 svint32_t u_01 = u_00;
61 3920 svint32_t u_10 = u_00;
62 3920 svint32_t u_11 = u_00;
63
64 3920 u_00 = svmlalb(u_00, uy_0, kBUWeight);
65 3920 u_01 = svmlalb(u_01, uy_1, kBUWeight);
66 3920 u_10 = svmlalt(u_10, uy_0, kBUWeight);
67 3920 u_11 = svmlalt(u_11, uy_1, kBUWeight);
68
69 3920 u_0 = combine_scaled_s16(u_00, u_10);
70 3920 u_1 = combine_scaled_s16(u_01, u_11);
71 3920 }
72
73 // Using the 16-bit Y value, calculate V:
// V = (half_ + (r - Y) * kRVWeight) scaled by combine_scaled_s16().
74 3920 svint16_t v_0, v_1;
75 {
76 3920 svint16_t vy_0 = svsub_x(VecTraits::svptrue(), r_0, y_0);
77 3920 svint16_t vy_1 = svsub_x(VecTraits::svptrue(), r_1, y_1);
78
79 3920 svint32_t v_00 = svdup_n_s32(half_);
80 3920 svint32_t v_10 = v_00;
81 3920 svint32_t v_01 = v_00;
82 3920 svint32_t v_11 = v_00;
83
84 3920 v_00 = svmlalb(v_00, vy_0, kRVWeight);
85 3920 v_01 = svmlalb(v_01, vy_1, kRVWeight);
86 3920 v_10 = svmlalt(v_10, vy_0, kRVWeight);
87 3920 v_11 = svmlalt(v_11, vy_1, kRVWeight);
88
89 3920 v_0 = combine_scaled_s16(v_00, v_10);
90 3920 v_1 = combine_scaled_s16(v_01, v_11);
91 3920 }
92
93 // Narrow the results to 8 bits (svqxtunb/svqxtunt: signed input, saturated
// to the unsigned 8-bit range); x_0 fills the even bytes, x_1 the odd bytes.
94 7840 svuint8x3_t yuv =
95 7840 svcreate3(svqxtunt(svqxtunb(y_0), y_1), svqxtunt(svqxtunb(u_0), u_1),
96 3920 svqxtunt(svqxtunb(v_0), v_1));
97
98 // Store interleaved YUV pixels to memory.
99 3920 svst3_u8(pg, dst, yuv);
100 3920 }
101
// Position of each colour channel within a source pixel; BGR swaps the
// red and blue positions.
102 static constexpr size_t r_index_ = BGR ? 2 : 0;
103 static constexpr size_t g_index_ = 1;
104 static constexpr size_t b_index_ = BGR ? 0 : 2;
// 128 (uint8_t max / 2 + 1) expressed in the fixed-point scale used by the
// weights, i.e. shifted left by kWeightScale.
105 static constexpr uint32_t half_ =
106 (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
// Shifts both 32-bit accumulators right by kWeightScale and narrows them into
// one 16-bit vector (svqrshrnb/svqrshrnt: rounding, saturating). `even` is
// written to the bottom (even) lanes and `odd` to the top (odd) lanes.
107 23520 static svint16_t combine_scaled_s16(svint32_t even,
108 svint32_t odd) KLEIDICV_STREAMING {
109 23520 return svqrshrnt(svqrshrnb(even, kWeightScale), odd, kWeightScale);
110 }
111 }; // end of class RGBToYUVBase<bool BGR>
112
113 // 3-channel input
// Row operation for 3-channel (RGB or BGR) 8-bit input. Loads and widens the
// channels, then defers to RGBToYUVBase for the conversion and store.
114 template <bool BGR>
115 class RGBToYUV final : public RGBToYUVBase<BGR> {
116 public:
117 using ContextType = Context;
118 using ScalarType = uint8_t;
119 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
120
121 // Returns the number of channels in the input image.
122 520 static constexpr size_t input_channels() KLEIDICV_STREAMING { return 3; }
123
// Converts one vector's worth of pixels from src and writes YUV to dst.
// svld3 de-interleaves the three channels under the context predicate; each
// channel is then widened to 16 bits as two vectors: even-numbered elements
// (svmovlb, suffix _0) and odd-numbered elements (svmovlt, suffix _1).
// The unsigned-to-signed reinterpret is value-preserving because widened
// 8-bit values never exceed 255.
124 1960 void vector_path(ContextType ctx, const ScalarType *src,
125 ScalarType *dst) KLEIDICV_STREAMING {
126 1960 auto pg = ctx.predicate();
127 1960 svuint8x3_t svsrc = svld3(pg, src);
128 1960 svint16_t r_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, r_index_)));
129 1960 svint16_t r_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, r_index_)));
130 1960 svint16_t g_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, g_index_)));
131 1960 svint16_t g_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, g_index_)));
132 1960 svint16_t b_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, b_index_)));
133 1960 svint16_t b_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, b_index_)));
134 3920 RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1,
135 1960 dst);
136 1960 }
137
138 private:
// Local aliases for the channel positions defined in the base class.
139 static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_;
140 static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_;
141 static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_;
142 }; // end of class RGBToYUV<bool BGR>
143
144 // 4-channel input
// Row operation for 4-channel (RGBA or BGRA) 8-bit input. The fourth channel
// is loaded but never used in the conversion. Loads four plain vectors,
// de-interleaves them by hand, then defers to RGBToYUVBase.
145 template <bool BGR>
146 class RGBAToYUV final : public RGBToYUVBase<BGR>, public UsesTailPath {
147 public:
148 using ContextType = Context;
149 using ScalarType = uint8_t;
150 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
151 using VectorType = typename VecTraits::VectorType;
152 using Vector4Type = typename VecTraits::Vector4Type;
153
// Binds to caller-owned storage for the four index tables and fills it.
// The object keeps only a reference, so sv4 must outlive this object.
// NOTE(review): presumably a reference is used because sizeless SVE types
// cannot be class data members - confirm.
154 672 explicit RGBAToYUV(svuint8x4_t &sv4) KLEIDICV_STREAMING
155 672 : deinterleave16_indices_(sv4) {
156 // clang-format off
157 // From the unzipped RGBA -> RBRBRBRB..., take it apart to even and odd
158 // pixels, and widen it to 16 bits. For that, we need these tables:
159 // 0, FF, 4, FF, 8, FF, 12, ... red0
160 // 1, FF, 5, FF, 9, FF, 13, ... blue0
161 // 2, FF, 6, FF, 10, FF, 14, ... red1
162 // 3, FF, 7, FF, 11, FF, 15, ... blue1
// The labels are for RGB order; with BGR the first channel of each pair is
// blue, which the b_index_ / 2 and r_index_ / 2 lookups account for.
// The FF entries are the high bytes of the 16-bit lanes built by svindex_u16.
// NOTE(review): they presumably rely on svtbl2 returning 0 for out-of-range
// indices. With 128-byte vectors the two-vector table has 256 entries, so
// index 0xFF looks in range - confirm the `svcntb() >= 256` fallback
// threshold in common_vector_path() is sufficient.
163 // clang-format on
164 672 deinterleave16_indices_ =
165 1344 svcreate4(svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004)),
166 672 svreinterpret_u8_u16(svindex_u16(0xFF01, 0x0004)),
167 672 svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004)),
168 672 svreinterpret_u8_u16(svindex_u16(0xFF03, 0x0004)));
169 672 }
170
171 // Returns the number of channels in the input image.
172 520 static constexpr size_t input_channels() KLEIDICV_STREAMING { return 4; }
173
// Converts pixels from four source vectors and writes YUV to dst.
// With SME2 the four vectors are fetched by one svld1_x4 using an all-true
// predicate-as-counter; otherwise four separate loads share ctx's predicate.
// In both cases ctx's predicate governs the final store.
174 KLEIDICV_FORCE_INLINE
175 1482 void vector_path(ContextType ctx, const ScalarType *src,
176 ScalarType *dst) KLEIDICV_STREAMING {
177 1482 auto pg = ctx.predicate();
178 #if KLEIDICV_TARGET_SME2
179 246 svcount_t p_counter = VecTraits::svptrue_c();
180 246 Vector4Type src_vect = svld1_x4(p_counter, src);
181 #else
182 1236 Vector4Type src_vect = common_load(pg, pg, pg, pg, src);
183 #endif
184 1482 common_vector_path(pg, src_vect, dst);
185 1482 }
186
// Same conversion as vector_path(), but each of the four source loads gets
// its own predicate derived from ctx's predicate by
// VecTraits::make_consecutive_predicates (defined outside this listing).
// NOTE(review): presumably this keeps the loads within the remaining part of
// the row - confirm against make_consecutive_predicates.
187 478 void tail_path(ContextType ctx, const ScalarType *src,
188 ScalarType *dst) KLEIDICV_STREAMING {
189 478 auto pg = ctx.predicate();
190 478 svbool_t pg_0, pg_1, pg_2, pg_3;
191 478 VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2, pg_3);
192 478 Vector4Type src_vect = common_load(pg_0, pg_1, pg_2, pg_3, src);
193 478 common_vector_path(pg, src_vect, dst);
194 478 }
195
196 private:
// De-interleaves four loaded vectors into widened R/G/B values and calls the
// shared conversion. pg is only used for the final store.
197 KLEIDICV_FORCE_INLINE
198 1960 void common_vector_path(svbool_t pg, Vector4Type src_vect,
199 ScalarType *dst) KLEIDICV_STREAMING {
200 1960 svint16_t r_0, r_1, g_0, g_1, b_0, b_1;
201
202 1960 VectorType src0 = svget4(src_vect, 0);
203 1960 VectorType src1 = svget4(src_vect, 1);
204 1960 VectorType src2 = svget4(src_vect, 2);
205 1960 VectorType src3 = svget4(src_vect, 3);
206
// First unzip step: even-numbered bytes (channels 0 and 2 of each pixel) go
// to rb_*, odd-numbered bytes (channels 1 and 3) go to ga_*.
207 1960 VectorType rb_l = svuzp1_u8(src0, src1);
208 1960 VectorType rb_h = svuzp1_u8(src2, src3);
209 1960 VectorType ga_l = svuzp2_u8(src0, src1);
210 1960 VectorType ga_h = svuzp2_u8(src2, src3);
211
4/4
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 876 times.
✓ Branch 2 taken 104 times.
✓ Branch 3 taken 876 times.
1960 if (KLEIDICV_UNLIKELY(svcntb() >= 256)) {
// Vectors of 256 bytes or more: the byte index tables cannot address the
// whole two-vector table, so unzip a second time and widen with
// svmovlb/svmovlt instead.
212 208 svuint8_t r, g, b;
213 if constexpr (BGR) {
214 104 b = svuzp1_u8(rb_l, rb_h);
215 104 r = svuzp2_u8(rb_l, rb_h);
216 } else {
217 104 r = svuzp1_u8(rb_l, rb_h);
218 104 b = svuzp2_u8(rb_l, rb_h);
219 }
220 208 g = svuzp1_u8(ga_l, ga_h);
221 208 r_0 = svreinterpret_s16_u16(svmovlb(r));
222 208 r_1 = svreinterpret_s16_u16(svmovlt(r));
223 208 g_0 = svreinterpret_s16_u16(svmovlb(g));
224 208 g_1 = svreinterpret_s16_u16(svmovlt(g));
225 208 b_0 = svreinterpret_s16_u16(svmovlb(b));
226 208 b_1 = svreinterpret_s16_u16(svmovlt(b));
227 208 } else {
// Table-lookup path: one svtbl2 per output vector both selects the channel
// and places it in the low byte of each 16-bit lane.
// index / 2 is 0 for the first channel of the rb pair and 1 for the second;
// adding 2 selects the matching table for the odd-numbered pixels.
228 1752 b_0 = svreinterpret_s16_u8(
229 3504 svtbl2(svcreate2(rb_l, rb_h),
230 1752 svget4(deinterleave16_indices_, b_index_ / 2)));
231 1752 b_1 = svreinterpret_s16_u8(
232 3504 svtbl2(svcreate2(rb_l, rb_h),
233 1752 svget4(deinterleave16_indices_, b_index_ / 2 + 2)));
234 1752 r_0 = svreinterpret_s16_u8(
235 3504 svtbl2(svcreate2(rb_l, rb_h),
236 1752 svget4(deinterleave16_indices_, r_index_ / 2)));
237 1752 r_1 = svreinterpret_s16_u8(
238 3504 svtbl2(svcreate2(rb_l, rb_h),
239 1752 svget4(deinterleave16_indices_, r_index_ / 2 + 2)));
240
// Green is the first channel of the ga pair, hence tables 0 and 2.
241 1752 g_0 = svreinterpret_s16_u8(
242 1752 svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 0)));
243 1752 g_1 = svreinterpret_s16_u8(
244 1752 svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 2)));
245 }
246 3920 RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1,
247 1960 dst);
248 1960 }
249
// Loads four consecutive vectors starting at src, each under its own
// predicate, and returns them as one four-vector tuple.
250 1714 Vector4Type common_load(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2,
251 svbool_t pg_3,
252 const ScalarType *src) KLEIDICV_STREAMING {
253 1714 VectorType src_0 = svld1(pg_0, &src[0]);
254 1714 VectorType src_1 = svld1_vnum(pg_1, &src[0], 1);
255 1714 VectorType src_2 = svld1_vnum(pg_2, &src[0], 2);
256 1714 VectorType src_3 = svld1_vnum(pg_3, &src[0], 3);
257 3428 return svcreate4(src_0, src_1, src_2, src_3);
258 1714 }
259
// Local aliases for the channel positions defined in the base class.
260 static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_;
261 static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_;
262 static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_;
263
// Caller-owned index tables filled in by the constructor.
264 svuint8x4_t &deinterleave16_indices_;
265 }; // end of class RGBAToYUV<bool BGR>
266
// Validates the arguments and runs `operation` over every row of the image.
//   operation  - one of the row operations above; its input_channels() gives
//                the number of channels per source pixel.
//   src, dst   - source and destination buffers with their strides.
//   width, height - image size passed to Rectangle.
// Returns KLEIDICV_OK once the rows have been processed. The CHECK_* macros
// are defined outside this listing; NOTE(review): presumably they return an
// error code from this function when a check fails - confirm.
267 template <typename OperationType, typename ScalarType>
268 1344 kleidicv_error_t rgb2yuv_operation(OperationType operation,
269 const ScalarType *src, size_t src_stride,
270 ScalarType *dst, size_t dst_stride,
271 size_t width,
272 size_t height) KLEIDICV_STREAMING {
273
16/16
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 320 times.
✓ Branch 2 taken 16 times.
✓ Branch 3 taken 320 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 320 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 320 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 320 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 320 times.
✓ Branch 12 taken 16 times.
✓ Branch 13 taken 320 times.
✓ Branch 14 taken 16 times.
✓ Branch 15 taken 320 times.
1344 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
274
16/16
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 304 times.
✓ Branch 2 taken 16 times.
✓ Branch 3 taken 304 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 304 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 304 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 304 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 304 times.
✓ Branch 12 taken 16 times.
✓ Branch 13 taken 304 times.
✓ Branch 14 taken 16 times.
✓ Branch 15 taken 304 times.
1280 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
275
24/24
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 280 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 260 times.
✓ Branch 4 taken 44 times.
✓ Branch 5 taken 260 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 280 times.
✓ Branch 8 taken 20 times.
✓ Branch 9 taken 260 times.
✓ Branch 10 taken 44 times.
✓ Branch 11 taken 260 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 280 times.
✓ Branch 14 taken 20 times.
✓ Branch 15 taken 260 times.
✓ Branch 16 taken 44 times.
✓ Branch 17 taken 260 times.
✓ Branch 18 taken 24 times.
✓ Branch 19 taken 280 times.
✓ Branch 20 taken 20 times.
✓ Branch 21 taken 260 times.
✓ Branch 22 taken 44 times.
✓ Branch 23 taken 260 times.
1216 CHECK_IMAGE_SIZE(width, height);
276
// Source rows use the operation's channel count; the YUV444 destination
// always has three channels per pixel.
277 1040 Rectangle rect{width, height};
278 1040 Rows src_rows{src, src_stride, operation.input_channels()};
279 1040 Rows dst_rows{dst, dst_stride, 3};
280
281 1040 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
282 1040 return KLEIDICV_OK;
283 1344 }
284
// Entry point of this header: converts an 8-bit RGB/BGR/RGBA/BGRA image to
// interleaved YUV444 by picking the row operation that matches color_format.
// Returns the result of rgb2yuv_operation() for the four supported formats
// and KLEIDICV_ERROR_NOT_IMPLEMENTED for any other value.
285 KLEIDICV_TARGET_FN_ATTRS
286 1440 static kleidicv_error_t rgb_to_yuv444_u8_sc(
287 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
288 size_t width, size_t height,
289 kleidicv_color_conversion_t color_format) KLEIDICV_STREAMING {
290
5/5
✓ Branch 0 taken 336 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 336 times.
✓ Branch 3 taken 336 times.
✓ Branch 4 taken 336 times.
1440 switch (color_format) {
291 case KLEIDICV_RGB_TO_YUV444: {
292 336 RGBToYUV<false> operation;
293 672 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
294 336 width, height);
295 336 }
296
297 case KLEIDICV_BGR_TO_YUV444: {
298 336 RGBToYUV<true> operation;
299 672 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
300 336 width, height);
301 336 }
302
303 case KLEIDICV_RGBA_TO_YUV444: {
// `indices` is the storage the operation references for its lookup tables;
// it is declared in this scope so that it outlives `operation`.
304 336 svuint8x4_t indices;
305 336 RGBAToYUV<false> operation(indices);
306 672 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
307 336 width, height);
308 336 }
309
310 case KLEIDICV_BGRA_TO_YUV444: {
// Same storage arrangement as the RGBA case.
311 336 svuint8x4_t indices;
312 336 RGBAToYUV<true> operation(indices);
313 672 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
314 336 width, height);
315 336 }
316
317 default:
318 96 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
319 }
320
// Every path through the switch above returns, so this statement is never
// reached.
321 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
322 1440 }
323
324 } // namespace KLEIDICV_TARGET_NAMESPACE
325
326 #endif // KLEIDICV_RGB_TO_YUV444_SC_H
327