KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv_sc.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 170 170 100.0%
Functions: 76 76 100.0%
Branches: 60 60 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RGB_TO_YUV_SC_H
6 #define KLEIDICV_RGB_TO_YUV_SC_H
7
8 #include <limits>
9 #include <memory>
10
11 #include "kleidicv/conversions/rgb_to_yuv.h"
12 #include "kleidicv/kleidicv.h"
13 #include "kleidicv/sve2.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 template <bool BGR>
18 class RGBToYUVBase : public UnrollOnce {
19 public:
20 using ContextType = Context;
21 using ScalarType = uint8_t;
22 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
23
24 protected:
25 3612 void vector_calculation_path(svbool_t pg, svint16_t r_0, svint16_t r_1,
26 svint16_t g_0, svint16_t g_1, svint16_t b_0,
27 svint16_t b_1,
28 ScalarType *dst) KLEIDICV_STREAMING {
29 // Compute Y value in 32-bit precision
30 3612 svint16_t y_0, y_1;
31 {
32 3612 svint32_t y_00 = svmullb(r_0, kRYWeight);
33 3612 svint32_t y_01 = svmullb(r_1, kRYWeight);
34 3612 svint32_t y_10 = svmullt(r_0, kRYWeight);
35 3612 svint32_t y_11 = svmullt(r_1, kRYWeight);
36
37 3612 y_00 = svmlalb(y_00, g_0, kGYWeight);
38 3612 y_01 = svmlalb(y_01, g_1, kGYWeight);
39 3612 y_10 = svmlalt(y_10, g_0, kGYWeight);
40 3612 y_11 = svmlalt(y_11, g_1, kGYWeight);
41
42 3612 y_00 = svmlalb(y_00, b_0, kBYWeight);
43 3612 y_01 = svmlalb(y_01, b_1, kBYWeight);
44 3612 y_10 = svmlalt(y_10, b_0, kBYWeight);
45 3612 y_11 = svmlalt(y_11, b_1, kBYWeight);
46
47 3612 y_0 = combine_scaled_s16(y_00, y_10);
48 3612 y_1 = combine_scaled_s16(y_01, y_11);
49 3612 }
50
51 // Using the 16-bit Y value, calculate U
52 3612 svint16_t u_0, u_1;
53 {
54 3612 svint16_t uy_0 = svsub_x(VecTraits::svptrue(), b_0, y_0);
55 3612 svint16_t uy_1 = svsub_x(VecTraits::svptrue(), b_1, y_1);
56
57 3612 svint32_t u_00 = svdup_n_s32(half_);
58 3612 svint32_t u_01 = u_00;
59 3612 svint32_t u_10 = u_00;
60 3612 svint32_t u_11 = u_00;
61
62 3612 u_00 = svmlalb(u_00, uy_0, kBUWeight);
63 3612 u_01 = svmlalb(u_01, uy_1, kBUWeight);
64 3612 u_10 = svmlalt(u_10, uy_0, kBUWeight);
65 3612 u_11 = svmlalt(u_11, uy_1, kBUWeight);
66
67 3612 u_0 = combine_scaled_s16(u_00, u_10);
68 3612 u_1 = combine_scaled_s16(u_01, u_11);
69 3612 }
70
71 // Using the 16-bit Y value, calculate V
72 3612 svint16_t v_0, v_1;
73 {
74 3612 svint16_t vy_0 = svsub_x(VecTraits::svptrue(), r_0, y_0);
75 3612 svint16_t vy_1 = svsub_x(VecTraits::svptrue(), r_1, y_1);
76
77 3612 svint32_t v_00 = svdup_n_s32(half_);
78 3612 svint32_t v_10 = v_00;
79 3612 svint32_t v_01 = v_00;
80 3612 svint32_t v_11 = v_00;
81
82 3612 v_00 = svmlalb(v_00, vy_0, kRVWeight);
83 3612 v_01 = svmlalb(v_01, vy_1, kRVWeight);
84 3612 v_10 = svmlalt(v_10, vy_0, kRVWeight);
85 3612 v_11 = svmlalt(v_11, vy_1, kRVWeight);
86
87 3612 v_0 = combine_scaled_s16(v_00, v_10);
88 3612 v_1 = combine_scaled_s16(v_01, v_11);
89 3612 }
90
91 // Narrow the results to 8 bits
92 7224 svuint8x3_t yuv =
93 7224 svcreate3(svqxtunt(svqxtunb(y_0), y_1), svqxtunt(svqxtunb(u_0), u_1),
94 3612 svqxtunt(svqxtunb(v_0), v_1));
95
96 // Store interleaved YUV pixels to memory.
97 3612 svst3_u8(pg, dst, yuv);
98 3612 }
99
100 static constexpr size_t r_index_ = BGR ? 2 : 0;
101 static constexpr size_t g_index_ = 1;
102 static constexpr size_t b_index_ = BGR ? 0 : 2;
103 static constexpr uint32_t half_ =
104 (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
105 21672 static svint16_t combine_scaled_s16(svint32_t even,
106 svint32_t odd) KLEIDICV_STREAMING {
107 21672 return svqrshrnt(svqrshrnb(even, kWeightScale), odd, kWeightScale);
108 }
109 }; // end of class RGBToYUVBase<bool BGR>
110
111 // 3-channel input
112 template <bool BGR>
113 class RGBToYUV final : public RGBToYUVBase<BGR> {
114 public:
115 using ContextType = Context;
116 using ScalarType = uint8_t;
117 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
118
119 // Returns the number of channels in the input image.
120 462 static constexpr size_t input_channels() KLEIDICV_STREAMING { return 3; }
121
122 1806 void vector_path(ContextType ctx, const ScalarType *src,
123 ScalarType *dst) KLEIDICV_STREAMING {
124 1806 auto pg = ctx.predicate();
125 1806 svuint8x3_t svsrc = svld3(pg, src);
126 1806 svint16_t r_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, r_index_)));
127 1806 svint16_t r_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, r_index_)));
128 1806 svint16_t g_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, g_index_)));
129 1806 svint16_t g_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, g_index_)));
130 1806 svint16_t b_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, b_index_)));
131 1806 svint16_t b_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, b_index_)));
132 3612 RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1,
133 1806 dst);
134 1806 }
135
136 private:
137 static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_;
138 static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_;
139 static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_;
140 }; // end of class RGBToYUV<bool BGR>
141
142 // 4-channel input
143 template <bool BGR>
144 class RGBAToYUV final : public RGBToYUVBase<BGR>, public UsesTailPath {
145 public:
146 using ContextType = Context;
147 using ScalarType = uint8_t;
148 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
149 using VectorType = typename VecTraits::VectorType;
150 using Vector4Type = typename VecTraits::Vector4Type;
151
152 590 explicit RGBAToYUV(svuint8x4_t &sv4) KLEIDICV_STREAMING
153 590 : deinterleave16_indices_(sv4) {
154 // clang-format off
155 // From the unzipped RGBA -> RBRBRBRB..., take it apart to even and odd
156 // pixels, and widen it to 16bits. For that, we need these tables:
157 // 0, FF, 4, FF, 8, FF, 12, ... red0
158 // 1, FF, 5, FF, 9, FF, 13, ... blue0
159 // 2, FF, 6, FF, 10, FF, 14, ... red1
160 // 3, FF, 7, FF, 11, FF, 15, ... blue1
161 // clang-format on
162 590 deinterleave16_indices_ =
163 1180 svcreate4(svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004)),
164 590 svreinterpret_u8_u16(svindex_u16(0xFF01, 0x0004)),
165 590 svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004)),
166 590 svreinterpret_u8_u16(svindex_u16(0xFF03, 0x0004)));
167 590 }
168
169 // Returns the number of channels in the input image.
170 462 static constexpr size_t input_channels() KLEIDICV_STREAMING { return 4; }
171
172 1470 void vector_path(ContextType ctx, const ScalarType *src,
173 ScalarType *dst) KLEIDICV_STREAMING {
174 1470 auto pg = ctx.predicate();
175 #if KLEIDICV_TARGET_SME2
176 246 svcount_t p_counter = VecTraits::svptrue_c();
177 246 Vector4Type src_vect = svld1_x4(p_counter, src);
178 #else
179 1224 Vector4Type src_vect = common_load(pg, pg, pg, pg, src);
180 #endif
181 1470 common_vector_path(pg, src_vect, dst);
182 1470 }
183
184 336 void tail_path(ContextType ctx, const ScalarType *src,
185 ScalarType *dst) KLEIDICV_STREAMING {
186 336 auto pg = ctx.predicate();
187 336 svbool_t pg_0, pg_1, pg_2, pg_3;
188 336 VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2, pg_3);
189 336 Vector4Type src_vect = common_load(pg_0, pg_1, pg_2, pg_3, src);
190 336 common_vector_path(pg, src_vect, dst);
191 336 }
192
193 private:
194 1806 void common_vector_path(svbool_t pg, Vector4Type src_vect,
195 ScalarType *dst) KLEIDICV_STREAMING {
196 1806 svint16_t r_0, r_1, g_0, g_1, b_0, b_1;
197
198 1806 VectorType src0 = svget4(src_vect, 0);
199 1806 VectorType src1 = svget4(src_vect, 1);
200 1806 VectorType src2 = svget4(src_vect, 2);
201 1806 VectorType src3 = svget4(src_vect, 3);
202
203 1806 VectorType rb_l = svuzp1_u8(src0, src1);
204 1806 VectorType rb_h = svuzp1_u8(src2, src3);
205 1806 VectorType ga_l = svuzp2_u8(src0, src1);
206 1806 VectorType ga_h = svuzp2_u8(src2, src3);
207
4/4
✓ Branch 0 taken 36 times.
✓ Branch 1 taken 867 times.
✓ Branch 2 taken 36 times.
✓ Branch 3 taken 867 times.
1806 if (KLEIDICV_UNLIKELY(svcntb() >= 256)) {
208 72 svuint8_t r, g, b;
209 if constexpr (BGR) {
210 36 b = svuzp1_u8(rb_l, rb_h);
211 36 r = svuzp2_u8(rb_l, rb_h);
212 } else {
213 36 r = svuzp1_u8(rb_l, rb_h);
214 36 b = svuzp2_u8(rb_l, rb_h);
215 }
216 72 g = svuzp1_u8(ga_l, ga_h);
217 72 r_0 = svreinterpret_s16_u16(svmovlb(r));
218 72 r_1 = svreinterpret_s16_u16(svmovlt(r));
219 72 g_0 = svreinterpret_s16_u16(svmovlb(g));
220 72 g_1 = svreinterpret_s16_u16(svmovlt(g));
221 72 b_0 = svreinterpret_s16_u16(svmovlb(b));
222 72 b_1 = svreinterpret_s16_u16(svmovlt(b));
223 72 } else {
224 1734 b_0 = svreinterpret_s16_u8(
225 3468 svtbl2(svcreate2(rb_l, rb_h),
226 1734 svget4(deinterleave16_indices_, b_index_ / 2)));
227 1734 b_1 = svreinterpret_s16_u8(
228 3468 svtbl2(svcreate2(rb_l, rb_h),
229 1734 svget4(deinterleave16_indices_, b_index_ / 2 + 2)));
230 1734 r_0 = svreinterpret_s16_u8(
231 3468 svtbl2(svcreate2(rb_l, rb_h),
232 1734 svget4(deinterleave16_indices_, r_index_ / 2)));
233 1734 r_1 = svreinterpret_s16_u8(
234 3468 svtbl2(svcreate2(rb_l, rb_h),
235 1734 svget4(deinterleave16_indices_, r_index_ / 2 + 2)));
236
237 1734 g_0 = svreinterpret_s16_u8(
238 1734 svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 0)));
239 1734 g_1 = svreinterpret_s16_u8(
240 1734 svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 2)));
241 }
242 3612 RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1,
243 1806 dst);
244 1806 }
245
246 1560 Vector4Type common_load(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2,
247 svbool_t pg_3,
248 const ScalarType *src) KLEIDICV_STREAMING {
249 1560 VectorType src_0 = svld1(pg_0, &src[0]);
250 1560 VectorType src_1 = svld1_vnum(pg_1, &src[0], 1);
251 1560 VectorType src_2 = svld1_vnum(pg_2, &src[0], 2);
252 1560 VectorType src_3 = svld1_vnum(pg_3, &src[0], 3);
253 3120 return svcreate4(src_0, src_1, src_2, src_3);
254 1560 }
255
256 static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_;
257 static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_;
258 static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_;
259
260 svuint8x4_t &deinterleave16_indices_;
261 }; // end of class RGBAToYUV<bool BGR>
262
263 template <typename OperationType, typename ScalarType>
264 1180 kleidicv_error_t rgb2yuv_operation(OperationType operation,
265 const ScalarType *src, size_t src_stride,
266 ScalarType *dst, size_t dst_stride,
267 size_t width,
268 size_t height) KLEIDICV_STREAMING {
269
16/16
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 364 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 364 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 279 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 279 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 279 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 279 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 194 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 194 times.
1180 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
270
16/16
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 344 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 344 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 263 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 263 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 263 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 263 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 182 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 182 times.
1116 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
271
24/24
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 324 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 304 times.
✓ Branch 4 taken 40 times.
✓ Branch 5 taken 304 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 247 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 231 times.
✓ Branch 10 taken 32 times.
✓ Branch 11 taken 231 times.
✓ Branch 12 taken 16 times.
✓ Branch 13 taken 247 times.
✓ Branch 14 taken 16 times.
✓ Branch 15 taken 231 times.
✓ Branch 16 taken 32 times.
✓ Branch 17 taken 231 times.
✓ Branch 18 taken 12 times.
✓ Branch 19 taken 170 times.
✓ Branch 20 taken 12 times.
✓ Branch 21 taken 158 times.
✓ Branch 22 taken 24 times.
✓ Branch 23 taken 158 times.
1052 CHECK_IMAGE_SIZE(width, height);
272
273 924 Rectangle rect{width, height};
274 924 Rows src_rows{src, src_stride, operation.input_channels()};
275 924 Rows dst_rows{dst, dst_stride, 3};
276
277 924 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
278 924 return KLEIDICV_OK;
279 1180 }
280
281 #if !KLEIDICV_TARGET_SME2
282 KLEIDICV_TARGET_FN_ATTRS
283 295 static kleidicv_error_t rgb_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
284 uint8_t *dst, size_t dst_stride,
285 size_t width,
286 size_t height) KLEIDICV_STREAMING {
287 295 RGBToYUV<false> operation;
288 885 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
289 295 height);
290 295 }
291
292 KLEIDICV_TARGET_FN_ATTRS
293 295 static kleidicv_error_t bgr_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
294 uint8_t *dst, size_t dst_stride,
295 size_t width,
296 size_t height) KLEIDICV_STREAMING {
297 295 RGBToYUV<true> operation;
298 885 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
299 295 height);
300 295 }
301
302 #endif // !KLEIDICV_TARGET_SME2
303
304 KLEIDICV_TARGET_FN_ATTRS
305 295 static kleidicv_error_t rgba_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
306 uint8_t *dst, size_t dst_stride,
307 size_t width,
308 size_t height) KLEIDICV_STREAMING {
309 295 svuint8x4_t indices;
310 295 RGBAToYUV<false> operation(indices);
311 885 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
312 295 height);
313 295 }
314
315 KLEIDICV_TARGET_FN_ATTRS
316 295 static kleidicv_error_t bgra_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
317 uint8_t *dst, size_t dst_stride,
318 size_t width,
319 size_t height) KLEIDICV_STREAMING {
320 295 svuint8x4_t indices;
321 295 RGBAToYUV<true> operation(indices);
322 885 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
323 295 height);
324 295 }
325
326 } // namespace KLEIDICV_TARGET_NAMESPACE
327
328 #endif // KLEIDICV_RGB_TO_YUV_SC_H
329