KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv_sc.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 159 159 100.0%
Functions: 52 52 100.0%
Branches: 60 60 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RGB_TO_YUV_SC_H
6 #define KLEIDICV_RGB_TO_YUV_SC_H
7
8 #include <limits>
9 #include <memory>
10
11 #include "kleidicv/conversions/rgb_to_yuv.h"
12 #include "kleidicv/kleidicv.h"
13 #include "kleidicv/sve2.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 template <bool BGR>
18 class RGBToYUVBase : public UnrollOnce {
19 public:
20 using ContextType = Context;
21 using ScalarType = uint8_t;
22 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
23
24 protected:
25 2764 void vector_calculation_path(svbool_t pg, svint16_t r_0, svint16_t r_1,
26 svint16_t g_0, svint16_t g_1, svint16_t b_0,
27 svint16_t b_1,
28 ScalarType *dst) KLEIDICV_STREAMING {
29 // Compute Y value in 32-bit precision
30 2764 svint16_t y_0, y_1;
31 {
32 2764 svint32_t y_00 = svmullb(r_0, kRYWeight);
33 2764 svint32_t y_01 = svmullb(r_1, kRYWeight);
34 2764 svint32_t y_10 = svmullt(r_0, kRYWeight);
35 2764 svint32_t y_11 = svmullt(r_1, kRYWeight);
36
37 2764 y_00 = svmlalb(y_00, g_0, kGYWeight);
38 2764 y_01 = svmlalb(y_01, g_1, kGYWeight);
39 2764 y_10 = svmlalt(y_10, g_0, kGYWeight);
40 2764 y_11 = svmlalt(y_11, g_1, kGYWeight);
41
42 2764 y_00 = svmlalb(y_00, b_0, kBYWeight);
43 2764 y_01 = svmlalb(y_01, b_1, kBYWeight);
44 2764 y_10 = svmlalt(y_10, b_0, kBYWeight);
45 2764 y_11 = svmlalt(y_11, b_1, kBYWeight);
46
47 2764 y_0 = combine_scaled_s16(y_00, y_10);
48 2764 y_1 = combine_scaled_s16(y_01, y_11);
49 2764 }
50
51 // Using the 16-bit Y value, calculate U
52 2764 svint16_t u_0, u_1;
53 {
54 2764 svint16_t uy_0 = svsub_x(VecTraits::svptrue(), b_0, y_0);
55 2764 svint16_t uy_1 = svsub_x(VecTraits::svptrue(), b_1, y_1);
56
57 2764 svint32_t u_00 = svdup_n_s32(half_);
58 2764 svint32_t u_01 = u_00;
59 2764 svint32_t u_10 = u_00;
60 2764 svint32_t u_11 = u_00;
61
62 2764 u_00 = svmlalb(u_00, uy_0, kBUWeight);
63 2764 u_01 = svmlalb(u_01, uy_1, kBUWeight);
64 2764 u_10 = svmlalt(u_10, uy_0, kBUWeight);
65 2764 u_11 = svmlalt(u_11, uy_1, kBUWeight);
66
67 2764 u_0 = combine_scaled_s16(u_00, u_10);
68 2764 u_1 = combine_scaled_s16(u_01, u_11);
69 2764 }
70
71 // Using the 16-bit Y value, calculate V
72 2764 svint16_t v_0, v_1;
73 {
74 2764 svint16_t vy_0 = svsub_x(VecTraits::svptrue(), r_0, y_0);
75 2764 svint16_t vy_1 = svsub_x(VecTraits::svptrue(), r_1, y_1);
76
77 2764 svint32_t v_00 = svdup_n_s32(half_);
78 2764 svint32_t v_10 = v_00;
79 2764 svint32_t v_01 = v_00;
80 2764 svint32_t v_11 = v_00;
81
82 2764 v_00 = svmlalb(v_00, vy_0, kRVWeight);
83 2764 v_01 = svmlalb(v_01, vy_1, kRVWeight);
84 2764 v_10 = svmlalt(v_10, vy_0, kRVWeight);
85 2764 v_11 = svmlalt(v_11, vy_1, kRVWeight);
86
87 2764 v_0 = combine_scaled_s16(v_00, v_10);
88 2764 v_1 = combine_scaled_s16(v_01, v_11);
89 2764 }
90
91 // Narrow the results to 8 bits
92 5528 svuint8x3_t yuv =
93 5528 svcreate3(svqxtunt(svqxtunb(y_0), y_1), svqxtunt(svqxtunb(u_0), u_1),
94 2764 svqxtunt(svqxtunb(v_0), v_1));
95
96 // Store interleaved YUV pixels to memory.
97 2764 svst3_u8(pg, dst, yuv);
98 2764 }
99
100 static constexpr size_t r_index_ = BGR ? 2 : 0;
101 static constexpr size_t g_index_ = 1;
102 static constexpr size_t b_index_ = BGR ? 0 : 2;
103 static constexpr uint32_t half_ =
104 (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
105 16584 static svint16_t combine_scaled_s16(svint32_t even,
106 svint32_t odd) KLEIDICV_STREAMING {
107 16584 return svqrshrnt(svqrshrnb(even, kWeightScale), odd, kWeightScale);
108 }
109 }; // end of class RGBToYUVBase<bool BGR>
110
111 // 3-channel input
112 template <bool BGR>
113 class RGBToYUV final : public RGBToYUVBase<BGR> {
114 public:
115 using ContextType = Context;
116 using ScalarType = uint8_t;
117 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
118
119 // Returns the number of channels in the input image.
120 316 static constexpr size_t input_channels() KLEIDICV_STREAMING { return 3; }
121
122 1382 void vector_path(ContextType ctx, const ScalarType *src,
123 ScalarType *dst) KLEIDICV_STREAMING {
124 1382 auto pg = ctx.predicate();
125 1382 svuint8x3_t svsrc = svld3(pg, src);
126 1382 svint16_t r_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, r_index_)));
127 1382 svint16_t r_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, r_index_)));
128 1382 svint16_t g_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, g_index_)));
129 1382 svint16_t g_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, g_index_)));
130 1382 svint16_t b_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, b_index_)));
131 1382 svint16_t b_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, b_index_)));
132 2764 RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1,
133 1382 dst);
134 1382 }
135
136 private:
137 static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_;
138 static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_;
139 static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_;
140 }; // end of class RGBToYUV<bool BGR>
141
142 // 4-channel input
143 template <bool BGR>
144 class RGBAToYUV final : public RGBToYUVBase<BGR>, public UsesTailPath {
145 public:
146 using ContextType = Context;
147 using ScalarType = uint8_t;
148 using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>;
149
150 412 explicit RGBAToYUV(svuint8x4_t &sv4) KLEIDICV_STREAMING
151 412 : deinterleave16_indices_(sv4) {
152 // clang-format off
153 // From the unzipped RGBA -> RBRBRBRB..., take it apart to even and odd
154 // pixels, and widen it to 16 bits. For that, we need these tables:
155 // 0, FF, 4, FF, 8, FF, 12, ... red0
156 // 1, FF, 5, FF, 9, FF, 13, ... blue0
157 // 2, FF, 6, FF, 10, FF, 14, ... red1
158 // 3, FF, 7, FF, 11, FF, 15, ... blue1
159 // clang-format on
160 412 deinterleave16_indices_ =
161 824 svcreate4(svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004)),
162 412 svreinterpret_u8_u16(svindex_u16(0xFF01, 0x0004)),
163 412 svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004)),
164 412 svreinterpret_u8_u16(svindex_u16(0xFF03, 0x0004)));
165 412 }
166
167 // Returns the number of channels in the input image.
168 316 static constexpr size_t input_channels() KLEIDICV_STREAMING { return 4; }
169
170 1164 void vector_path(ContextType ctx, const ScalarType *src,
171 ScalarType *dst) KLEIDICV_STREAMING {
172 1164 auto pg = ctx.predicate();
173 1164 common_vector_path(pg, pg, pg, pg, pg, src, dst);
174 1164 }
175
176 218 void tail_path(ContextType ctx, const ScalarType *src,
177 ScalarType *dst) KLEIDICV_STREAMING {
178 218 auto pg = ctx.predicate();
179 218 svbool_t pg_0, pg_1, pg_2, pg_3;
180 218 VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2, pg_3);
181 218 common_vector_path(pg, pg_0, pg_1, pg_2, pg_3, src, dst);
182 218 }
183
184 private:
185 1382 void common_vector_path(svbool_t pg, svbool_t pg_0, svbool_t pg_1,
186 svbool_t pg_2, svbool_t pg_3, const ScalarType *src,
187 ScalarType *dst) KLEIDICV_STREAMING {
188 1382 svint16_t r_0, r_1, g_0, g_1, b_0, b_1;
189
190 1382 svuint8_t src0 = svld1(pg_0, src);
191 1382 svuint8_t src1 = svld1_vnum(pg_1, src, 1);
192 1382 svuint8_t src2 = svld1_vnum(pg_2, src, 2);
193 1382 svuint8_t src3 = svld1_vnum(pg_3, src, 3);
194
195 1382 svuint8_t rb_l = svuzp1_u8(src0, src1);
196 1382 svuint8_t rb_h = svuzp1_u8(src2, src3);
197 1382 svuint8_t ga_l = svuzp2_u8(src0, src1);
198 1382 svuint8_t ga_h = svuzp2_u8(src2, src3);
199
4/4
✓ Branch 0 taken 26 times.
✓ Branch 1 taken 665 times.
✓ Branch 2 taken 26 times.
✓ Branch 3 taken 665 times.
1382 if (KLEIDICV_UNLIKELY(svcntb() >= 256)) {
200 52 svuint8_t r, g, b;
201 if constexpr (BGR) {
202 26 b = svuzp1_u8(rb_l, rb_h);
203 26 r = svuzp2_u8(rb_l, rb_h);
204 } else {
205 26 r = svuzp1_u8(rb_l, rb_h);
206 26 b = svuzp2_u8(rb_l, rb_h);
207 }
208 52 g = svuzp1_u8(ga_l, ga_h);
209 52 r_0 = svreinterpret_s16_u16(svmovlb(r));
210 52 r_1 = svreinterpret_s16_u16(svmovlt(r));
211 52 g_0 = svreinterpret_s16_u16(svmovlb(g));
212 52 g_1 = svreinterpret_s16_u16(svmovlt(g));
213 52 b_0 = svreinterpret_s16_u16(svmovlb(b));
214 52 b_1 = svreinterpret_s16_u16(svmovlt(b));
215 52 } else {
216 1330 b_0 = svreinterpret_s16_u8(
217 2660 svtbl2(svcreate2(rb_l, rb_h),
218 1330 svget4(deinterleave16_indices_, b_index_ / 2)));
219 1330 b_1 = svreinterpret_s16_u8(
220 2660 svtbl2(svcreate2(rb_l, rb_h),
221 1330 svget4(deinterleave16_indices_, b_index_ / 2 + 2)));
222 1330 r_0 = svreinterpret_s16_u8(
223 2660 svtbl2(svcreate2(rb_l, rb_h),
224 1330 svget4(deinterleave16_indices_, r_index_ / 2)));
225 1330 r_1 = svreinterpret_s16_u8(
226 2660 svtbl2(svcreate2(rb_l, rb_h),
227 1330 svget4(deinterleave16_indices_, r_index_ / 2 + 2)));
228
229 1330 g_0 = svreinterpret_s16_u8(
230 1330 svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 0)));
231 1330 g_1 = svreinterpret_s16_u8(
232 1330 svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 2)));
233 }
234 2764 RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1,
235 1382 dst);
236 1382 }
237
238 static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_;
239 static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_;
240 static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_;
241
242 svuint8x4_t &deinterleave16_indices_;
243 }; // end of class RGBAToYUV<bool BGR>
244
245 template <typename OperationType, typename ScalarType>
246 824 kleidicv_error_t rgb2yuv_operation(OperationType operation,
247 const ScalarType *src, size_t src_stride,
248 ScalarType *dst, size_t dst_stride,
249 size_t width,
250 size_t height) KLEIDICV_STREAMING {
251
16/16
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 194 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 194 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 194 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 194 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 194 times.
✓ Branch 10 taken 12 times.
✓ Branch 11 taken 194 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 194 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 194 times.
824 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
252
16/16
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 182 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 182 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 182 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 182 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 182 times.
✓ Branch 10 taken 12 times.
✓ Branch 11 taken 182 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 182 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 182 times.
776 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
253
24/24
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 170 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 158 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 158 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 170 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 158 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 158 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 170 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 158 times.
✓ Branch 16 taken 24 times.
✓ Branch 17 taken 158 times.
✓ Branch 18 taken 12 times.
✓ Branch 19 taken 170 times.
✓ Branch 20 taken 12 times.
✓ Branch 21 taken 158 times.
✓ Branch 22 taken 24 times.
✓ Branch 23 taken 158 times.
728 CHECK_IMAGE_SIZE(width, height);
254
255 632 Rectangle rect{width, height};
256 632 Rows src_rows{src, src_stride, operation.input_channels()};
257 632 Rows dst_rows{dst, dst_stride, 3};
258
259 632 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
260 632 return KLEIDICV_OK;
261 824 }
262
263 KLEIDICV_TARGET_FN_ATTRS
264 206 static kleidicv_error_t rgb_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
265 uint8_t *dst, size_t dst_stride,
266 size_t width,
267 size_t height) KLEIDICV_STREAMING {
268 206 RGBToYUV<false> operation;
269 618 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
270 206 height);
271 206 }
272
273 KLEIDICV_TARGET_FN_ATTRS
274 206 static kleidicv_error_t rgba_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
275 uint8_t *dst, size_t dst_stride,
276 size_t width,
277 size_t height) KLEIDICV_STREAMING {
278 206 svuint8x4_t indices;
279 206 RGBAToYUV<false> operation(indices);
280 618 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
281 206 height);
282 206 }
283
284 KLEIDICV_TARGET_FN_ATTRS
285 206 static kleidicv_error_t bgr_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
286 uint8_t *dst, size_t dst_stride,
287 size_t width,
288 size_t height) KLEIDICV_STREAMING {
289 206 RGBToYUV<true> operation;
290 618 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
291 206 height);
292 206 }
293
294 KLEIDICV_TARGET_FN_ATTRS
295 206 static kleidicv_error_t bgra_to_yuv_u8_sc(const uint8_t *src, size_t src_stride,
296 uint8_t *dst, size_t dst_stride,
297 size_t width,
298 size_t height) KLEIDICV_STREAMING {
299 206 svuint8x4_t indices;
300 206 RGBAToYUV<true> operation(indices);
301 618 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
302 206 height);
303 206 }
304
305 } // namespace KLEIDICV_TARGET_NAMESPACE
306
307 #endif // KLEIDICV_RGB_TO_YUV_SC_H
308