Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_RGB_TO_YUV_SC_H | ||
6 | #define KLEIDICV_RGB_TO_YUV_SC_H | ||
7 | |||
8 | #include <limits> | ||
9 | #include <memory> | ||
10 | |||
11 | #include "kleidicv/conversions/rgb_to_yuv.h" | ||
12 | #include "kleidicv/kleidicv.h" | ||
13 | #include "kleidicv/sve2.h" | ||
14 | |||
15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
16 | |||
// Shared arithmetic for the RGB(A)/BGR(A) to YUV kernels. The BGR template flag
// selects which channel position holds red and which holds blue (see r_index_
// and b_index_ below); green is always at position 1.
17 | template <bool BGR> | ||
18 | class RGBToYUVBase : public UnrollOnce { | ||
19 | public: | ||
20 | using ContextType = Context; | ||
21 | using ScalarType = uint8_t; | ||
22 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
23 | |||
24 | protected: | ||
// Converts one batch of pixels to YUV and stores it as interleaved 3-channel
// data at dst, governed by the predicate pg.
// Every colour channel arrives widened to 16 bits and split over two vectors:
// the callers in this file put the even pixels in *_0 and the odd pixels in *_1.
// The weights (kRYWeight, ...) and kWeightScale are declared outside this
// listing, presumably in kleidicv/conversions/rgb_to_yuv.h.
25 | 2764 | void vector_calculation_path(svbool_t pg, svint16_t r_0, svint16_t r_1, | |
26 | svint16_t g_0, svint16_t g_1, svint16_t b_0, | ||
27 | svint16_t b_1, | ||
28 | ScalarType *dst) KLEIDICV_STREAMING { | ||
29 | // Compute Y = R * kRYWeight + G * kGYWeight + B * kBYWeight in 32-bit precision; the b/t intrinsic pairs process the even and odd 16-bit lanes separately. | ||
30 | 2764 | svint16_t y_0, y_1; | |
31 | { | ||
32 | 2764 | svint32_t y_00 = svmullb(r_0, kRYWeight); | |
33 | 2764 | svint32_t y_01 = svmullb(r_1, kRYWeight); | |
34 | 2764 | svint32_t y_10 = svmullt(r_0, kRYWeight); | |
35 | 2764 | svint32_t y_11 = svmullt(r_1, kRYWeight); | |
36 | |||
37 | 2764 | y_00 = svmlalb(y_00, g_0, kGYWeight); | |
38 | 2764 | y_01 = svmlalb(y_01, g_1, kGYWeight); | |
39 | 2764 | y_10 = svmlalt(y_10, g_0, kGYWeight); | |
40 | 2764 | y_11 = svmlalt(y_11, g_1, kGYWeight); | |
41 | |||
42 | 2764 | y_00 = svmlalb(y_00, b_0, kBYWeight); | |
43 | 2764 | y_01 = svmlalb(y_01, b_1, kBYWeight); | |
44 | 2764 | y_10 = svmlalt(y_10, b_0, kBYWeight); | |
45 | 2764 | y_11 = svmlalt(y_11, b_1, kBYWeight); | |
46 | |||
// Shift the sums back down by kWeightScale and re-interleave even/odd lanes.
47 | 2764 | y_0 = combine_scaled_s16(y_00, y_10); | |
48 | 2764 | y_1 = combine_scaled_s16(y_01, y_11); | |
49 | 2764 | } | |
50 | |||
51 | // Using the 16-bit Y value, calculate U = ((B - Y) * kBUWeight + half_), shifted down by kWeightScale | ||
52 | 2764 | svint16_t u_0, u_1; | |
53 | { | ||
54 | 2764 | svint16_t uy_0 = svsub_x(VecTraits::svptrue(), b_0, y_0); | |
55 | 2764 | svint16_t uy_1 = svsub_x(VecTraits::svptrue(), b_1, y_1); | |
56 | |||
// Start every accumulator at half_ so the offset is added before the shift.
57 | 2764 | svint32_t u_00 = svdup_n_s32(half_); | |
58 | 2764 | svint32_t u_01 = u_00; | |
59 | 2764 | svint32_t u_10 = u_00; | |
60 | 2764 | svint32_t u_11 = u_00; | |
61 | |||
62 | 2764 | u_00 = svmlalb(u_00, uy_0, kBUWeight); | |
63 | 2764 | u_01 = svmlalb(u_01, uy_1, kBUWeight); | |
64 | 2764 | u_10 = svmlalt(u_10, uy_0, kBUWeight); | |
65 | 2764 | u_11 = svmlalt(u_11, uy_1, kBUWeight); | |
66 | |||
67 | 2764 | u_0 = combine_scaled_s16(u_00, u_10); | |
68 | 2764 | u_1 = combine_scaled_s16(u_01, u_11); | |
69 | 2764 | } | |
70 | |||
71 | // Using the 16-bit Y value, calculate V = ((R - Y) * kRVWeight + half_), shifted down by kWeightScale | ||
72 | 2764 | svint16_t v_0, v_1; | |
73 | { | ||
74 | 2764 | svint16_t vy_0 = svsub_x(VecTraits::svptrue(), r_0, y_0); | |
75 | 2764 | svint16_t vy_1 = svsub_x(VecTraits::svptrue(), r_1, y_1); | |
76 | |||
77 | 2764 | svint32_t v_00 = svdup_n_s32(half_); | |
78 | 2764 | svint32_t v_10 = v_00; | |
79 | 2764 | svint32_t v_01 = v_00; | |
80 | 2764 | svint32_t v_11 = v_00; | |
81 | |||
82 | 2764 | v_00 = svmlalb(v_00, vy_0, kRVWeight); | |
83 | 2764 | v_01 = svmlalb(v_01, vy_1, kRVWeight); | |
84 | 2764 | v_10 = svmlalt(v_10, vy_0, kRVWeight); | |
85 | 2764 | v_11 = svmlalt(v_11, vy_1, kRVWeight); | |
86 | |||
87 | 2764 | v_0 = combine_scaled_s16(v_00, v_10); | |
88 | 2764 | v_1 = combine_scaled_s16(v_01, v_11); | |
89 | 2764 | } | |
90 | |||
91 | // Narrow the results to 8 bits with unsigned saturation; the *_0 vectors fill the even bytes and the *_1 vectors the odd bytes. | ||
92 | 5528 | svuint8x3_t yuv = | |
93 | 5528 | svcreate3(svqxtunt(svqxtunb(y_0), y_1), svqxtunt(svqxtunb(u_0), u_1), | |
94 | 2764 | svqxtunt(svqxtunb(v_0), v_1)); | |
95 | |||
96 | // Store interleaved YUV pixels to memory; pg decides which pixels are written. | ||
97 | 2764 | svst3_u8(pg, dst, yuv); | |
98 | 2764 | } | |
99 | |||
// Positions of the red, green and blue channels inside an input pixel.
100 | static constexpr size_t r_index_ = BGR ? 2 : 0; | ||
101 | static constexpr size_t g_index_ = 1; | ||
102 | static constexpr size_t b_index_ = BGR ? 0 : 2; | ||
// Offset added to U and V: 128 shifted left by kWeightScale, i.e. expressed in
// the same scale as the weighted sums it is added to.
103 | static constexpr uint32_t half_ = | ||
104 | (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale; | ||
// Narrows two 32-bit vectors into one 16-bit vector using a saturating,
// rounding right shift by kWeightScale; `even` lands in the even lanes and
// `odd` in the odd lanes of the result.
105 | 16584 | static svint16_t combine_scaled_s16(svint32_t even, | |
106 | svint32_t odd) KLEIDICV_STREAMING { | ||
107 | 16584 | return svqrshrnt(svqrshrnb(even, kWeightScale), odd, kWeightScale); | |
108 | } | ||
109 | }; // end of class RGBToYUVBase<bool BGR> | ||
110 | |||
111 | // 3-channel input: converts interleaved RGB pixels (BGR when the template flag is true) to YUV. | ||
112 | template <bool BGR> | ||
113 | class RGBToYUV final : public RGBToYUVBase<BGR> { | ||
114 | public: | ||
115 | using ContextType = Context; | ||
116 | using ScalarType = uint8_t; | ||
117 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
118 | |||
119 | // Returns the number of channels in the input image. | ||
120 | 316 | static constexpr size_t input_channels() KLEIDICV_STREAMING { return 3; } | |
121 | |||
// Processes the pixels selected by ctx.predicate(): a de-interleaving svld3
// splits the input into per-channel vectors, each channel is widened to 16 bits
// (svmovlb -> even pixels, svmovlt -> odd pixels), and the shared calculation
// in the base class computes and stores the YUV result at dst.
122 | 1382 | void vector_path(ContextType ctx, const ScalarType *src, | |
123 | ScalarType *dst) KLEIDICV_STREAMING { | ||
124 | 1382 | auto pg = ctx.predicate(); | |
125 | 1382 | svuint8x3_t svsrc = svld3(pg, src); | |
126 | 1382 | svint16_t r_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, r_index_))); | |
127 | 1382 | svint16_t r_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, r_index_))); | |
128 | 1382 | svint16_t g_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, g_index_))); | |
129 | 1382 | svint16_t g_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, g_index_))); | |
130 | 1382 | svint16_t b_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, b_index_))); | |
131 | 1382 | svint16_t b_1 = svreinterpret_s16_u16(svmovlt(svget3(svsrc, b_index_))); | |
132 | 2764 | RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1, | |
133 | 1382 | dst); | |
134 | 1382 | } | |
135 | |||
136 | private: | ||
// Local aliases for the channel positions defined in the base class.
137 | static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_; | ||
138 | static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_; | ||
139 | static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_; | ||
140 | }; // end of class RGBToYUV<bool BGR> | ||
141 | |||
142 | // 4-channel input: converts interleaved RGBA pixels (BGRA when the template flag is true) to YUV; the fourth channel is loaded but never used. | ||
143 | template <bool BGR> | ||
144 | class RGBAToYUV final : public RGBToYUVBase<BGR>, public UsesTailPath { | ||
145 | public: | ||
146 | using ContextType = Context; | ||
147 | using ScalarType = uint8_t; | ||
148 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
149 | |||
// Binds the caller-provided sv4 as storage for the four lookup tables and
// fills it. The object keeps a reference, so sv4 must outlive it.
150 | 412 | explicit RGBAToYUV(svuint8x4_t &sv4) KLEIDICV_STREAMING | |
151 | 412 | : deinterleave16_indices_(sv4) { | |
152 | // clang-format off | ||
153 | // From the unzipped RGBA -> RBRBRBRB..., take it apart to even and odd | ||
154 | // pixels, and widen it to 16bits. For that, we need these tables: | ||
155 | // 0, FF, 4, FF, 8, FF, 12, ... red0 | ||
156 | // 1, FF, 5, FF, 9, FF, 13, ... blue0 | ||
157 | // 2, FF, 6, FF, 10, FF, 14, ... red1 | ||
158 | // 3, FF, 7, FF, 11, FF, 15, ... blue1 | ||
// The labels assume RGBA order; for BGRA the first and third tables select
// blue and the second and fourth select red. The green vectors are extracted
// from the GAGA... data with the first and third tables.
// Each table is generated as the 16-bit sequence 0xFFnn + 4 * lane and then
// viewed as bytes. The FF bytes are presumably meant to be out-of-range
// svtbl2 indices, so that the upper byte of every 16-bit lane reads as zero.
159 | // clang-format on | ||
160 | 412 | deinterleave16_indices_ = | |
161 | 824 | svcreate4(svreinterpret_u8_u16(svindex_u16(0xFF00, 0x0004)), | |
162 | 412 | svreinterpret_u8_u16(svindex_u16(0xFF01, 0x0004)), | |
163 | 412 | svreinterpret_u8_u16(svindex_u16(0xFF02, 0x0004)), | |
164 | 412 | svreinterpret_u8_u16(svindex_u16(0xFF03, 0x0004))); | |
165 | 412 | } | |
166 | |||
167 | // Returns the number of channels in the input image. | ||
168 | 316 | static constexpr size_t input_channels() KLEIDICV_STREAMING { return 4; } | |
169 | |||
// Main path: the predicate from ctx is used for the store and for all four
// loads.
170 | 1164 | void vector_path(ContextType ctx, const ScalarType *src, | |
171 | ScalarType *dst) KLEIDICV_STREAMING { | ||
172 | 1164 | auto pg = ctx.predicate(); | |
173 | 1164 | common_vector_path(pg, pg, pg, pg, pg, src, dst); | |
174 | 1164 | } | |
175 | |||
// Tail path: the predicate from ctx still governs the store, while each of the
// four loads gets its own predicate derived from it by
// make_consecutive_predicates (defined elsewhere; presumably it spreads the
// remaining element count over four consecutive vectors - TODO confirm).
176 | 218 | void tail_path(ContextType ctx, const ScalarType *src, | |
177 | ScalarType *dst) KLEIDICV_STREAMING { | ||
178 | 218 | auto pg = ctx.predicate(); | |
179 | 218 | svbool_t pg_0, pg_1, pg_2, pg_3; | |
180 | 218 | VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2, pg_3); | |
181 | 218 | common_vector_path(pg, pg_0, pg_1, pg_2, pg_3, src, dst); | |
182 | 218 | } | |
183 | |||
184 | private: | ||
// Loads four consecutive vectors of 4-channel pixels (one predicate per load),
// separates and widens the colour channels to 16 bits, then calls the shared
// calculation, which stores the result at dst under predicate pg.
185 | 1382 | void common_vector_path(svbool_t pg, svbool_t pg_0, svbool_t pg_1, | |
186 | svbool_t pg_2, svbool_t pg_3, const ScalarType *src, | ||
187 | ScalarType *dst) KLEIDICV_STREAMING { | ||
188 | 1382 | svint16_t r_0, r_1, g_0, g_1, b_0, b_1; | |
189 | |||
190 | 1382 | svuint8_t src0 = svld1(pg_0, src); | |
191 | 1382 | svuint8_t src1 = svld1_vnum(pg_1, src, 1); | |
192 | 1382 | svuint8_t src2 = svld1_vnum(pg_2, src, 2); | |
193 | 1382 | svuint8_t src3 = svld1_vnum(pg_3, src, 3); | |
194 | |||
// Even bytes hold the first and third channel (R/B or B/R), odd bytes hold
// green and the unused fourth channel.
195 | 1382 | svuint8_t rb_l = svuzp1_u8(src0, src1); | |
196 | 1382 | svuint8_t rb_h = svuzp1_u8(src2, src3); | |
197 | 1382 | svuint8_t ga_l = svuzp2_u8(src0, src1); | |
198 | 1382 | svuint8_t ga_h = svuzp2_u8(src2, src3); | |
// Vectors of 256 bytes or more skip the lookup tables and use a second unzip
// plus svmovlb/svmovlt instead. Presumably this is because the 16-bit index
// sequence from the constructor wraps there (0xFF00 + 4 * 64 no longer fits in
// 16 bits) - TODO confirm.
// NOTE(review): with svcntb() == 128 the two-vector table holds 256 bytes, so
// the index 0xFF looks in range for svtbl2 instead of producing zero. Confirm
// that this vector length is handled correctly by the table path.
199 |
4/4✓ Branch 0 taken 26 times.
✓ Branch 1 taken 665 times.
✓ Branch 2 taken 26 times.
✓ Branch 3 taken 665 times.
|
1382 | if (KLEIDICV_UNLIKELY(svcntb() >= 256)) { |
200 | 52 | svuint8_t r, g, b; | |
201 | if constexpr (BGR) { | ||
202 | 26 | b = svuzp1_u8(rb_l, rb_h); | |
203 | 26 | r = svuzp2_u8(rb_l, rb_h); | |
204 | } else { | ||
205 | 26 | r = svuzp1_u8(rb_l, rb_h); | |
206 | 26 | b = svuzp2_u8(rb_l, rb_h); | |
207 | } | ||
208 | 52 | g = svuzp1_u8(ga_l, ga_h); | |
209 | 52 | r_0 = svreinterpret_s16_u16(svmovlb(r)); | |
210 | 52 | r_1 = svreinterpret_s16_u16(svmovlt(r)); | |
211 | 52 | g_0 = svreinterpret_s16_u16(svmovlb(g)); | |
212 | 52 | g_1 = svreinterpret_s16_u16(svmovlt(g)); | |
213 | 52 | b_0 = svreinterpret_s16_u16(svmovlb(b)); | |
214 | 52 | b_1 = svreinterpret_s16_u16(svmovlt(b)); | |
215 | 52 | } else { | |
// Table path: index / 2 maps channel position 0 or 2 to table 0 or 1 (even
// pixels); adding 2 selects the matching odd-pixel table.
216 | 1330 | b_0 = svreinterpret_s16_u8( | |
217 | 2660 | svtbl2(svcreate2(rb_l, rb_h), | |
218 | 1330 | svget4(deinterleave16_indices_, b_index_ / 2))); | |
219 | 1330 | b_1 = svreinterpret_s16_u8( | |
220 | 2660 | svtbl2(svcreate2(rb_l, rb_h), | |
221 | 1330 | svget4(deinterleave16_indices_, b_index_ / 2 + 2))); | |
222 | 1330 | r_0 = svreinterpret_s16_u8( | |
223 | 2660 | svtbl2(svcreate2(rb_l, rb_h), | |
224 | 1330 | svget4(deinterleave16_indices_, r_index_ / 2))); | |
225 | 1330 | r_1 = svreinterpret_s16_u8( | |
226 | 2660 | svtbl2(svcreate2(rb_l, rb_h), | |
227 | 1330 | svget4(deinterleave16_indices_, r_index_ / 2 + 2))); | |
228 | |||
// Green is the first byte of every G/A pair, hence tables 0 and 2.
229 | 1330 | g_0 = svreinterpret_s16_u8( | |
230 | 1330 | svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 0))); | |
231 | 1330 | g_1 = svreinterpret_s16_u8( | |
232 | 1330 | svtbl2(svcreate2(ga_l, ga_h), svget4(deinterleave16_indices_, 2))); | |
233 | } | ||
234 | 2764 | RGBToYUVBase<BGR>::vector_calculation_path(pg, r_0, r_1, g_0, g_1, b_0, b_1, | |
235 | 1382 | dst); | |
236 | 1382 | } | |
237 | |||
// Local aliases for the channel positions defined in the base class.
238 | static constexpr size_t r_index_ = RGBToYUVBase<BGR>::r_index_; | ||
239 | static constexpr size_t g_index_ = RGBToYUVBase<BGR>::g_index_; | ||
240 | static constexpr size_t b_index_ = RGBToYUVBase<BGR>::b_index_; | ||
241 | |||
// Lookup tables built by the constructor. Held by reference to caller-owned
// storage, presumably because SVE sizeless types cannot be class data members.
242 | svuint8x4_t &deinterleave16_indices_; | ||
243 | }; // end of class RGBAToYUV<bool BGR> | ||
244 | |||
// Common driver for all RGB-family to YUV conversions in this file.
// Checks src, dst, their strides and the image size, then applies `operation`
// row by row over a width x height image.
//   operation  - conversion object; its input_channels() gives the number of
//                channels per input pixel.
//   src / dst  - input and output image data with their respective strides.
// Returns KLEIDICV_OK once the rows have been processed. The CHECK_* macros
// are defined elsewhere; presumably they return an error code early when a
// check fails - TODO confirm.
245 | template <typename OperationType, typename ScalarType> | ||
246 | 824 | kleidicv_error_t rgb2yuv_operation(OperationType operation, | |
247 | const ScalarType *src, size_t src_stride, | ||
248 | ScalarType *dst, size_t dst_stride, | ||
249 | size_t width, | ||
250 | size_t height) KLEIDICV_STREAMING { | ||
251 |
16/16✓ Branch 0 taken 12 times.
✓ Branch 1 taken 194 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 194 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 194 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 194 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 194 times.
✓ Branch 10 taken 12 times.
✓ Branch 11 taken 194 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 194 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 194 times.
|
824 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
252 |
16/16✓ Branch 0 taken 12 times.
✓ Branch 1 taken 182 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 182 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 182 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 182 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 182 times.
✓ Branch 10 taken 12 times.
✓ Branch 11 taken 182 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 182 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 182 times.
|
776 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
253 |
24/24✓ Branch 0 taken 12 times.
✓ Branch 1 taken 170 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 158 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 158 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 170 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 158 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 158 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 170 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 158 times.
✓ Branch 16 taken 24 times.
✓ Branch 17 taken 158 times.
✓ Branch 18 taken 12 times.
✓ Branch 19 taken 170 times.
✓ Branch 20 taken 12 times.
✓ Branch 21 taken 158 times.
✓ Branch 22 taken 24 times.
✓ Branch 23 taken 158 times.
|
728 | CHECK_IMAGE_SIZE(width, height); |
254 | |||
// Input rows use the operation's channel count; output rows always have three
// channels (Y, U, V).
255 | 632 | Rectangle rect{width, height}; | |
256 | 632 | Rows src_rows{src, src_stride, operation.input_channels()}; | |
257 | 632 | Rows dst_rows{dst, dst_stride, 3}; | |
258 | |||
259 | 632 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
260 | 632 | return KLEIDICV_OK; | |
261 | 824 | } | |
262 | |||
// 8-bit RGB (3 channels, red first) to YUV: instantiates RGBToYUV<false> and
// forwards all arguments to rgb2yuv_operation, returning its status code.
263 | KLEIDICV_TARGET_FN_ATTRS | ||
264 | 206 | static kleidicv_error_t rgb_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, | |
265 | uint8_t *dst, size_t dst_stride, | ||
266 | size_t width, | ||
267 | size_t height) KLEIDICV_STREAMING { | ||
268 | 206 | RGBToYUV<false> operation; | |
269 | 618 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
270 | 206 | height); | |
271 | 206 | } | |
272 | |||
// 8-bit RGBA (4 channels, red first) to YUV: instantiates RGBAToYUV<false> and
// forwards all arguments to rgb2yuv_operation, returning its status code.
273 | KLEIDICV_TARGET_FN_ATTRS | ||
274 | 206 | static kleidicv_error_t rgba_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, | |
275 | uint8_t *dst, size_t dst_stride, | ||
276 | size_t width, | ||
277 | size_t height) KLEIDICV_STREAMING { | ||
// Local storage for the lookup tables; the RGBAToYUV constructor fills it and
// the operation keeps a reference to it.
278 | 206 | svuint8x4_t indices; | |
279 | 206 | RGBAToYUV<false> operation(indices); | |
280 | 618 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
281 | 206 | height); | |
282 | 206 | } | |
283 | |||
// 8-bit BGR (3 channels, blue first) to YUV: instantiates RGBToYUV<true> and
// forwards all arguments to rgb2yuv_operation, returning its status code.
284 | KLEIDICV_TARGET_FN_ATTRS | ||
285 | 206 | static kleidicv_error_t bgr_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, | |
286 | uint8_t *dst, size_t dst_stride, | ||
287 | size_t width, | ||
288 | size_t height) KLEIDICV_STREAMING { | ||
289 | 206 | RGBToYUV<true> operation; | |
290 | 618 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
291 | 206 | height); | |
292 | 206 | } | |
293 | |||
// 8-bit BGRA (4 channels, blue first) to YUV: instantiates RGBAToYUV<true> and
// forwards all arguments to rgb2yuv_operation, returning its status code.
294 | KLEIDICV_TARGET_FN_ATTRS | ||
295 | 206 | static kleidicv_error_t bgra_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, | |
296 | uint8_t *dst, size_t dst_stride, | ||
297 | size_t width, | ||
298 | size_t height) KLEIDICV_STREAMING { | ||
// Local storage for the lookup tables; the RGBAToYUV constructor fills it and
// the operation keeps a reference to it.
299 | 206 | svuint8x4_t indices; | |
300 | 206 | RGBAToYUV<true> operation(indices); | |
301 | 618 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
302 | 206 | height); | |
303 | 206 | } | |
304 | |||
305 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
306 | |||
307 | #endif // KLEIDICV_RGB_TO_YUV_SC_H | ||
308 |