Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_RGB_TO_YUV420_SC_H | ||
6 | #define KLEIDICV_RGB_TO_YUV420_SC_H | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <functional> | ||
10 | #include <utility> | ||
11 | |||
12 | #include "kleidicv/kleidicv.h" | ||
13 | #include "kleidicv/sve2.h" | ||
14 | #include "yuv420_coefficients.h" | ||
15 | |||
16 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
17 | |||
18 | template <bool kAlpha, bool RGB, bool kInterleave> | ||
19 | class RGBxorBGRxToYUV420 { | ||
20 | public: | ||
21 | using ArrayOfFour_svuint32 = ScalableVectorArray1D<svuint32_t, 4>; | ||
22 | using ArrayOfFour_svint32 = ScalableVectorArray1D<svint32_t, 4>; | ||
23 | using ArrayOfTwo_svint32 = ScalableVectorArray1D<svint32_t, 2>; | ||
24 | |||
25 | 1128 | static kleidicv_error_t rgb2yuv420_operation_sc( | |
26 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
27 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
28 | bool v_first, size_t begin, size_t end) KLEIDICV_STREAMING { | ||
29 | 1128 | size_t row_begin = begin * 2; | |
30 | 1128 | size_t row_end = std::min<size_t>(height, end * 2); | |
31 | 1128 | const uint8_t *src_row = nullptr; | |
32 | 1128 | uint8_t *y_row = nullptr; | |
33 | 1128 | uint8_t *u_row = nullptr; | |
34 | 1128 | uint8_t *v_row = nullptr; | |
35 |
8/8✓ Branch 0 taken 282 times.
✓ Branch 1 taken 14430 times.
✓ Branch 2 taken 282 times.
✓ Branch 3 taken 14430 times.
✓ Branch 4 taken 282 times.
✓ Branch 5 taken 14430 times.
✓ Branch 6 taken 282 times.
✓ Branch 7 taken 14430 times.
|
58848 | for (size_t h = row_begin; h < row_end; h++) { |
36 | 57720 | src_row = src + src_stride * h; | |
37 | 57720 | y_row = y_dst + y_stride * h; | |
38 | 57720 | bool evenRow = (h & 1) == 0; | |
39 |
8/8✓ Branch 0 taken 7146 times.
✓ Branch 1 taken 7284 times.
✓ Branch 2 taken 7146 times.
✓ Branch 3 taken 7284 times.
✓ Branch 4 taken 7146 times.
✓ Branch 5 taken 7284 times.
✓ Branch 6 taken 7146 times.
✓ Branch 7 taken 7284 times.
|
57720 | if (evenRow) { |
40 | if constexpr (kInterleave) { | ||
41 | 14736 | u_row = uv_dst + uv_stride * (h / 2); | |
42 | } else { | ||
43 | 14400 | u_row = | |
44 | 14400 | uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2); | |
45 | // Pointer to the start of the V plane. | ||
46 | // The V plane follows the U plane. Both U and V planes are | ||
47 | // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 | ||
48 | // rows), and are often stored in a single contiguous chroma region in | ||
49 | // memory. Depending on image height and stride, the starting offset | ||
50 | // of V may require adjustment to maintain correct alignment. In | ||
51 | // particular, the chroma rows may not align perfectly, so a | ||
52 | // fractional offset (in rows) is applied to calculate the V plane | ||
53 | // position. The formula used here accounts for this by adjusting | ||
54 | // based on row parity, assuming consistent memory layout across the | ||
55 | // Y, U, and V planes. | ||
56 | 28800 | v_row = uv_dst + uv_stride * ((h + height + 1) / 4) + | |
57 | 14400 | (((h + height + 1) / 2) % 2) * ((width + 1) / 2); | |
58 | } | ||
59 | 29136 | } | |
60 | |||
61 | 57720 | const size_t kVectorLength = svcntb(); | |
62 | 57720 | LoopUnroll2 loop{width, kVectorLength}; | |
63 | |||
64 | 71544 | loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
65 | 13824 | svbool_t pg = svptrue_b8(); | |
66 | |||
67 | 27648 | vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow, | |
68 | 13824 | pg, pg, pg); | |
69 | 13824 | }); | |
70 | |||
71 | 114960 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
72 | 57240 | svbool_t pg = svwhilelt_b8_u64(index, length); | |
73 | 57240 | svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) / 2); | |
74 |
8/8✓ Branch 0 taken 14526 times.
✓ Branch 1 taken 14310 times.
✓ Branch 2 taken 14526 times.
✓ Branch 3 taken 14310 times.
✓ Branch 4 taken 14526 times.
✓ Branch 5 taken 14310 times.
✓ Branch 6 taken 14526 times.
✓ Branch 7 taken 14310 times.
|
115344 | while (svptest_first(svptrue_b8(), pg)) { |
75 | 116208 | vector_path(src_row, y_row, u_row, v_row, v_first, index, evenRow, pg, | |
76 | 58104 | pg_half); | |
77 | 58104 | index += kVectorLength; | |
78 | 58104 | pg = svwhilelt_b8_u64(index, length); | |
79 | 58104 | pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) / 2); | |
80 | } | ||
81 | 57240 | }); | |
82 | 57720 | } | |
83 | 1128 | return KLEIDICV_OK; | |
84 | 1128 | } | |
85 | |||
86 | private: | ||
87 | 13824 | static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row, | |
88 | uint8_t *u_row, uint8_t *v_row, bool v_first, | ||
89 | const size_t index, const bool evenRow, | ||
90 | const svbool_t pg0, const svbool_t pg1, | ||
91 | const svbool_t pg_half) KLEIDICV_STREAMING { | ||
92 | 13824 | const size_t kVectorLength = svcntb(); | |
93 | 13824 | svuint32_t r0_0, r0_1, r0_2, r0_3, g0_0, g0_1, g0_2, g0_3, b0_0, b0_1, b0_2, | |
94 | b0_3, r1_0, r1_1, r1_2, r1_3, g1_0, g1_1, g1_2, g1_3, b1_0, b1_1, b1_2, | ||
95 | b1_3; | ||
96 | |||
97 | 13824 | ArrayOfFour_svuint32 r0 = { | |
98 | 13824 | {std::ref(r0_0), std::ref(r0_1), std::ref(r0_2), std::ref(r0_3)}}; | |
99 | 13824 | ArrayOfFour_svuint32 g0 = { | |
100 | 13824 | {std::ref(g0_0), std::ref(g0_1), std::ref(g0_2), std::ref(g0_3)}}; | |
101 | 13824 | ArrayOfFour_svuint32 b0 = { | |
102 | 13824 | {std::ref(b0_0), std::ref(b0_1), std::ref(b0_2), std::ref(b0_3)}}; | |
103 | 13824 | ArrayOfFour_svuint32 r1 = { | |
104 | 13824 | {std::ref(r1_0), std::ref(r1_1), std::ref(r1_2), std::ref(r1_3)}}; | |
105 | 13824 | ArrayOfFour_svuint32 g1 = { | |
106 | 13824 | {std::ref(g1_0), std::ref(g1_1), std::ref(g1_2), std::ref(g1_3)}}; | |
107 | 13824 | ArrayOfFour_svuint32 b1 = { | |
108 | 13824 | {std::ref(b1_0), std::ref(b1_1), std::ref(b1_2), std::ref(b1_3)}}; | |
109 | |||
110 | 13824 | load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, scn * index, pg0, pg1); | |
111 | |||
112 | 13824 | svuint8_t y0 = rgb_to_y(r0, g0, b0); | |
113 | |||
114 | 13824 | svuint8_t y1 = rgb_to_y(r1, g1, b1); | |
115 | |||
116 | 13824 | svst1(pg0, y_row + index, y0); | |
117 | 13824 | svst1(pg1, y_row + index + kVectorLength, y1); | |
118 | |||
119 |
8/8✓ Branch 0 taken 1664 times.
✓ Branch 1 taken 1792 times.
✓ Branch 2 taken 1664 times.
✓ Branch 3 taken 1792 times.
✓ Branch 4 taken 1664 times.
✓ Branch 5 taken 1792 times.
✓ Branch 6 taken 1664 times.
✓ Branch 7 taken 1792 times.
|
13824 | if (evenRow) { |
120 | 7168 | svuint8_t u, v; | |
121 | 7168 | svint32_t r_even_0 = svreinterpret_s32(r0_0); | |
122 | 7168 | svint32_t r_even_1 = svreinterpret_s32(r0_1); | |
123 | 7168 | svint32_t r_even_2 = svreinterpret_s32(r1_0); | |
124 | 7168 | svint32_t r_even_3 = svreinterpret_s32(r1_1); | |
125 | 7168 | svint32_t g_even_0 = svreinterpret_s32(g0_0); | |
126 | 7168 | svint32_t g_even_1 = svreinterpret_s32(g0_1); | |
127 | 7168 | svint32_t g_even_2 = svreinterpret_s32(g1_0); | |
128 | 7168 | svint32_t g_even_3 = svreinterpret_s32(g1_1); | |
129 | 7168 | svint32_t b_even_0 = svreinterpret_s32(b0_0); | |
130 | 7168 | svint32_t b_even_1 = svreinterpret_s32(b0_1); | |
131 | 7168 | svint32_t b_even_2 = svreinterpret_s32(b1_0); | |
132 | 7168 | svint32_t b_even_3 = svreinterpret_s32(b1_1); | |
133 | |||
134 | 14336 | ArrayOfFour_svint32 r_even = {{std::ref(r_even_0), std::ref(r_even_1), | |
135 | 14336 | std::ref(r_even_2), std::ref(r_even_3)}}; | |
136 | 14336 | ArrayOfFour_svint32 g_even = {{std::ref(g_even_0), std::ref(g_even_1), | |
137 | 14336 | std::ref(g_even_2), std::ref(g_even_3)}}; | |
138 | 14336 | ArrayOfFour_svint32 b_even = {{std::ref(b_even_0), std::ref(b_even_1), | |
139 | 14336 | std::ref(b_even_2), std::ref(b_even_3)}}; | |
140 | |||
141 | 7168 | rgb_to_uv_2x(r_even, g_even, b_even, u, v); | |
142 | |||
143 |
8/8✓ Branch 0 taken 896 times.
✓ Branch 1 taken 896 times.
✓ Branch 2 taken 896 times.
✓ Branch 3 taken 896 times.
✓ Branch 4 taken 896 times.
✓ Branch 5 taken 896 times.
✓ Branch 6 taken 896 times.
✓ Branch 7 taken 896 times.
|
7168 | if (v_first) { |
144 | 3584 | swap_scalable(u, v); | |
145 | 3584 | } | |
146 | |||
147 | if constexpr (kInterleave) { | ||
148 | 3584 | svuint8x2_t uv = svcreate2(u, v); | |
149 | 3584 | svst2_u8(pg_half, u_row + index, uv); | |
150 | 3584 | } else { | |
151 | 3584 | svst1(pg_half, u_row + index / 2, u); | |
152 | 3584 | svst1(pg_half, v_row + index / 2, v); | |
153 | } | ||
154 | 7168 | } | |
155 | 13824 | } | |
156 | |||
157 | 58104 | static void vector_path(const uint8_t *src_row, uint8_t *y_row, | |
158 | uint8_t *u_row, uint8_t *v_row, bool v_first, | ||
159 | const size_t index, const bool evenRow, | ||
160 | const svbool_t pg0, | ||
161 | const svbool_t pg_half) KLEIDICV_STREAMING { | ||
162 | 58104 | svuint32_t r0_0, r0_1, r0_2, r0_3, g0_0, g0_1, g0_2, g0_3, b0_0, b0_1, b0_2, | |
163 | b0_3; | ||
164 | |||
165 | 58104 | ArrayOfFour_svuint32 r0 = { | |
166 | 58104 | {std::ref(r0_0), std::ref(r0_1), std::ref(r0_2), std::ref(r0_3)}}; | |
167 | 58104 | ArrayOfFour_svuint32 g0 = { | |
168 | 58104 | {std::ref(g0_0), std::ref(g0_1), std::ref(g0_2), std::ref(g0_3)}}; | |
169 | 58104 | ArrayOfFour_svuint32 b0 = { | |
170 | 58104 | {std::ref(b0_0), std::ref(b0_1), std::ref(b0_2), std::ref(b0_3)}}; | |
171 | |||
172 | 58104 | load_rgb(r0, g0, b0, src_row, scn * index, pg0); | |
173 | |||
174 | 58104 | svuint8_t y0 = rgb_to_y(r0, g0, b0); | |
175 | |||
176 | 58104 | svst1(pg0, y_row + index, y0); | |
177 | |||
178 |
8/8✓ Branch 0 taken 7198 times.
✓ Branch 1 taken 7328 times.
✓ Branch 2 taken 7198 times.
✓ Branch 3 taken 7328 times.
✓ Branch 4 taken 7198 times.
✓ Branch 5 taken 7328 times.
✓ Branch 6 taken 7198 times.
✓ Branch 7 taken 7328 times.
|
58104 | if (evenRow) { |
179 | 29312 | svuint8_t u, v; | |
180 | 29312 | svint32_t r_even_0 = svreinterpret_s32(r0_0); | |
181 | 29312 | svint32_t r_even_1 = svreinterpret_s32(r0_1); | |
182 | 29312 | svint32_t g_even_0 = svreinterpret_s32(g0_0); | |
183 | 29312 | svint32_t g_even_1 = svreinterpret_s32(g0_1); | |
184 | 29312 | svint32_t b_even_0 = svreinterpret_s32(b0_0); | |
185 | 29312 | svint32_t b_even_1 = svreinterpret_s32(b0_1); | |
186 | |||
187 | 29312 | ArrayOfTwo_svint32 r_even = {{std::ref(r_even_0), std::ref(r_even_1)}}; | |
188 | 29312 | ArrayOfTwo_svint32 g_even = {{std::ref(g_even_0), std::ref(g_even_1)}}; | |
189 | 29312 | ArrayOfTwo_svint32 b_even = {{std::ref(b_even_0), std::ref(b_even_1)}}; | |
190 | |||
191 | 29312 | rgb_to_uv(r_even, g_even, b_even, u, v); | |
192 | |||
193 |
8/8✓ Branch 0 taken 7006 times.
✓ Branch 1 taken 322 times.
✓ Branch 2 taken 7006 times.
✓ Branch 3 taken 322 times.
✓ Branch 4 taken 7006 times.
✓ Branch 5 taken 322 times.
✓ Branch 6 taken 7006 times.
✓ Branch 7 taken 322 times.
|
29312 | if (v_first) { |
194 | 1288 | swap_scalable(u, v); | |
195 | 1288 | } | |
196 | |||
197 | if constexpr (kInterleave) { | ||
198 | 14824 | svuint8x2_t uv = svcreate2(u, v); | |
199 | 14824 | svst2_u8(pg_half, u_row + index, uv); | |
200 | 14824 | } else { | |
201 | 14488 | svst1(pg_half, u_row + index / 2, u); | |
202 | 14488 | svst1(pg_half, v_row + index / 2, v); | |
203 | } | ||
204 | 29312 | } | |
205 | 58104 | } | |
206 | |||
207 | 85752 | static svuint8_t rgb_to_y(ArrayOfFour_svuint32 r, ArrayOfFour_svuint32 g, | |
208 | ArrayOfFour_svuint32 b) KLEIDICV_STREAMING { | ||
209 | 85752 | const uint32_t kShifted16 = (16 << kWeightScale); | |
210 | 85752 | const uint32_t kHalfShift = (1 << (kWeightScale - 1)); | |
211 | |||
212 | 85752 | svbool_t pg = svptrue_b32(); | |
213 | |||
214 | // Y = kR*R + kG*G + kB*B + rounding bias | ||
215 | 85752 | svuint32_t bias = svdup_u32(kHalfShift + kShifted16); | |
216 | 85752 | svuint32_t y_0 = bias; | |
217 | 85752 | svuint32_t y_1 = bias; | |
218 | 85752 | svuint32_t y_2 = bias; | |
219 | 85752 | svuint32_t y_3 = bias; | |
220 | |||
221 | 85752 | ArrayOfFour_svuint32 y = { | |
222 | 85752 | {std::ref(y_0), std::ref(y_1), std::ref(y_2), std::ref(y_3)}}; | |
223 | |||
224 | KLEIDICV_FORCE_LOOP_UNROLL | ||
225 |
8/8✓ Branch 0 taken 21438 times.
✓ Branch 1 taken 85752 times.
✓ Branch 2 taken 21438 times.
✓ Branch 3 taken 85752 times.
✓ Branch 4 taken 21438 times.
✓ Branch 5 taken 85752 times.
✓ Branch 6 taken 21438 times.
✓ Branch 7 taken 85752 times.
|
428760 | for (int i = 0; i < 4; i++) { |
226 | 343008 | y(i) = svmla_n_u32_x(pg, y(i), r(i), kRYWeight); | |
227 | 343008 | y(i) = svmla_n_u32_x(pg, y(i), g(i), kGYWeight); | |
228 | 343008 | y(i) = svmla_n_u32_x(pg, y(i), b(i), kBYWeight); | |
229 | 343008 | } | |
230 | |||
231 | 85752 | svuint16_t y_b = svshrnb_n_u32(y(0), kWeightScale - 8); | |
232 | 85752 | y_b = svshrnt_n_u32(y_b, y(2), kWeightScale - 8); // 0, 1, 2, 3, 4, 5, 6, 7 | |
233 | 85752 | svuint16_t y_t = svshrnb_n_u32(y(1), kWeightScale - 8); | |
234 | 85752 | y_t = svshrnt_n_u32(y_t, y(3), | |
235 | kWeightScale - 8); // 8, 9, 10, 11, 12, 13, 14, 15 | ||
236 | |||
237 | 171504 | return svuzp2_u8(svreinterpret_u8(y_b), svreinterpret_u8(y_t)); | |
238 | 85752 | } | |
239 | |||
240 | 14336 | static svuint8_t compute_u_or_v_2x(ArrayOfFour_svint32 r, | |
241 | ArrayOfFour_svint32 g, | ||
242 | ArrayOfFour_svint32 b, const int r_coeff, | ||
243 | const int g_coeff, | ||
244 | const int b_coeff) KLEIDICV_STREAMING { | ||
245 | 14336 | svbool_t pg = svptrue_b32(); | |
246 | 14336 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
247 | 14336 | const int kShifted128 = (128 << kWeightScale); | |
248 | 14336 | svint32_t bias = svdup_s32(kHalfShift + kShifted128); | |
249 | 14336 | svint32_t uv0 = bias; | |
250 | 14336 | svint32_t uv1 = bias; | |
251 | 14336 | svint32_t uv2 = bias; | |
252 | 14336 | svint32_t uv3 = bias; | |
253 | |||
254 | 14336 | ArrayOfFour_svint32 uv = { | |
255 | 14336 | {std::ref(uv0), std::ref(uv1), std::ref(uv2), std::ref(uv3)}}; | |
256 | |||
257 | KLEIDICV_FORCE_LOOP_UNROLL | ||
258 |
8/8✓ Branch 0 taken 3584 times.
✓ Branch 1 taken 14336 times.
✓ Branch 2 taken 3584 times.
✓ Branch 3 taken 14336 times.
✓ Branch 4 taken 3584 times.
✓ Branch 5 taken 14336 times.
✓ Branch 6 taken 3584 times.
✓ Branch 7 taken 14336 times.
|
71680 | for (int i = 0; i < 4; i++) { |
259 | 57344 | uv(i) = svmla_n_s32_x(pg, uv(i), r(i), r_coeff); | |
260 | 57344 | uv(i) = svmla_n_s32_x(pg, uv(i), g(i), g_coeff); | |
261 | 57344 | uv(i) = svmla_n_s32_x(pg, uv(i), b(i), b_coeff); | |
262 | 57344 | } | |
263 | |||
264 | 28672 | svint16_t uv_b = | |
265 | 14336 | svuzp2_s16(svreinterpret_s16(uv(0)), svreinterpret_s16(uv(1))); | |
266 | 28672 | svint16_t uv_t = | |
267 | 14336 | svuzp2_s16(svreinterpret_s16(uv(2)), svreinterpret_s16(uv(3))); | |
268 | |||
269 | 14336 | uv_b = svasr_n_s16_x(pg, uv_b, kWeightScale - 16); | |
270 | 14336 | uv_t = svasr_n_s16_x(pg, uv_t, kWeightScale - 16); | |
271 | |||
272 | 28672 | return svuzp1_u8(svreinterpret_u8(uv_b), svreinterpret_u8(uv_t)); | |
273 | 14336 | } | |
274 | |||
275 | 7168 | static void rgb_to_uv_2x(ArrayOfFour_svint32 r, ArrayOfFour_svint32 g, | |
276 | ArrayOfFour_svint32 b, svuint8_t &u, | ||
277 | svuint8_t &v) KLEIDICV_STREAMING { | ||
278 | // ---------------- U (Cb) Component ---------------- | ||
279 | // U = R * kRU + G * kGU + B * kBU + bias | ||
280 | 7168 | u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight); | |
281 | |||
282 | // ---------------- V (Cr) Component ---------------- | ||
283 | // V = R * kBU + G * kGV + B * kBV + bias | ||
284 | 7168 | v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight); | |
285 | 7168 | } | |
286 | |||
287 | 58624 | static svuint8_t compute_u_or_v(ArrayOfTwo_svint32 r, ArrayOfTwo_svint32 g, | |
288 | ArrayOfTwo_svint32 b, const int r_coeff, | ||
289 | const int g_coeff, | ||
290 | const int b_coeff) KLEIDICV_STREAMING { | ||
291 | 58624 | svbool_t pg = svptrue_b32(); | |
292 | 58624 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
293 | 58624 | const int kShifted128 = (128 << kWeightScale); | |
294 | |||
295 | 58624 | svint32_t bias = svdup_s32(kHalfShift + kShifted128); | |
296 | 58624 | svint32_t uv0 = bias; | |
297 | 58624 | svint32_t uv1 = bias; | |
298 | |||
299 | 58624 | ArrayOfTwo_svint32 uv = {{std::ref(uv0), std::ref(uv1)}}; | |
300 | |||
301 | KLEIDICV_FORCE_LOOP_UNROLL | ||
302 |
8/8✓ Branch 0 taken 14656 times.
✓ Branch 1 taken 29312 times.
✓ Branch 2 taken 14656 times.
✓ Branch 3 taken 29312 times.
✓ Branch 4 taken 14656 times.
✓ Branch 5 taken 29312 times.
✓ Branch 6 taken 14656 times.
✓ Branch 7 taken 29312 times.
|
175872 | for (int i = 0; i < 2; i++) { |
303 | 117248 | uv(i) = svmla_n_s32_x(pg, uv(i), r(i), r_coeff); | |
304 | 117248 | uv(i) = svmla_n_s32_x(pg, uv(i), g(i), g_coeff); | |
305 | 117248 | uv(i) = svmla_n_s32_x(pg, uv(i), b(i), b_coeff); | |
306 | 117248 | } | |
307 | |||
308 | 117248 | svint16_t output = | |
309 | 58624 | svuzp2_s16(svreinterpret_s16(uv(0)), svreinterpret_s16(uv(1))); | |
310 | |||
311 | 58624 | output = svasr_n_s16_x(pg, output, kWeightScale - 16); | |
312 | |||
313 | 117248 | return svuzp1_u8(svreinterpret_u8(output), svreinterpret_u8(output)); | |
314 | 58624 | } | |
315 | |||
316 | 29312 | static void rgb_to_uv(ArrayOfTwo_svint32 r, ArrayOfTwo_svint32 g, | |
317 | ArrayOfTwo_svint32 b, svuint8_t &u, | ||
318 | svuint8_t &v) KLEIDICV_STREAMING { | ||
319 | // ---------------- U (Cb) Component ---------------- | ||
320 | // U = R * kRU + G * kGU + B * kBU + bias | ||
321 | 29312 | u = compute_u_or_v(r, g, b, kRUWeight, kGUWeight, kBUWeight); | |
322 | |||
323 | // ---------------- V (Cr) Component ---------------- | ||
324 | // V = R * kBU + G * kGV + B * kBV + bias | ||
325 | 29312 | v = compute_u_or_v(r, g, b, kBUWeight, kGVWeight, kBVWeight); | |
326 | 29312 | } | |
327 | |||
328 | 85752 | static void load_rgb(ArrayOfFour_svuint32 &r, ArrayOfFour_svuint32 &g, | |
329 | ArrayOfFour_svuint32 &b, const uint8_t *src_row, | ||
330 | const size_t w, const svbool_t &pg0) KLEIDICV_STREAMING { | ||
331 | 85752 | svuint8_t b0, g0, r0; | |
332 | if constexpr (kAlpha) { | ||
333 | // 4-channel input (RGBA or BGRA) | ||
334 | 42876 | svuint8x4_t vsrc0 = svld4(pg0, src_row + w); | |
335 | |||
336 | 42876 | b0 = svget4(vsrc0, b_index); | |
337 | 42876 | g0 = svget4(vsrc0, g_index); | |
338 | 42876 | r0 = svget4(vsrc0, r_index); | |
339 | |||
340 | 42876 | } else { | |
341 | // 3-channel input (RGB or BGR) | ||
342 | 42876 | svuint8x3_t vsrc0 = svld3(pg0, src_row + w); | |
343 | |||
344 | 42876 | b0 = svget3(vsrc0, b_index); | |
345 | 42876 | g0 = svget3(vsrc0, g_index); | |
346 | 42876 | r0 = svget3(vsrc0, r_index); | |
347 | 42876 | } | |
348 | 85752 | svuint16_t r0_lo = svmovlb(r0); | |
349 | 85752 | svuint16_t r0_hi = svmovlt(r0); | |
350 | 85752 | r(0) = svunpklo(r0_lo); // 0, 2, 4, 6 | |
351 | 85752 | r(1) = svunpkhi(r0_lo); // 8, 10, 12, 14 | |
352 | 85752 | r(2) = svunpklo(r0_hi); // 1, 3, 5, 7 | |
353 | 85752 | r(3) = svunpkhi(r0_hi); // 9, 11, 13, 15 | |
354 | |||
355 | 85752 | svuint16_t g0_lo = svmovlb(g0); | |
356 | 85752 | svuint16_t g0_hi = svmovlt(g0); | |
357 | 85752 | g(0) = svunpklo(g0_lo); | |
358 | 85752 | g(1) = svunpkhi(g0_lo); | |
359 | 85752 | g(2) = svunpklo(g0_hi); | |
360 | 85752 | g(3) = svunpkhi(g0_hi); | |
361 | |||
362 | 85752 | svuint16_t b0_lo = svmovlb(b0); | |
363 | 85752 | svuint16_t b0_hi = svmovlt(b0); | |
364 | 85752 | b(0) = svunpklo(b0_lo); | |
365 | 85752 | b(1) = svunpkhi(b0_lo); | |
366 | 85752 | b(2) = svunpklo(b0_hi); | |
367 | 85752 | b(3) = svunpkhi(b0_hi); | |
368 | 85752 | } | |
369 | |||
370 | 13824 | static void load_rgb_2x(ArrayOfFour_svuint32 &r0, ArrayOfFour_svuint32 &g0, | |
371 | ArrayOfFour_svuint32 &b0, ArrayOfFour_svuint32 &r1, | ||
372 | ArrayOfFour_svuint32 &g1, ArrayOfFour_svuint32 &b1, | ||
373 | const uint8_t *src_row, const size_t w, | ||
374 | const svbool_t pg0, | ||
375 | const svbool_t pg1) KLEIDICV_STREAMING { | ||
376 | 13824 | const size_t kVectorLength = svcntb(); | |
377 | 13824 | load_rgb(r0, g0, b0, src_row, w, pg0); | |
378 | |||
379 | 13824 | load_rgb(r1, g1, b1, src_row, w + scn * kVectorLength, pg1); | |
380 | 13824 | } | |
381 | |||
382 | static constexpr int b_index = RGB ? 2 : 0; | ||
383 | static constexpr int g_index = 1; | ||
384 | static constexpr int r_index = RGB ? 0 : 2; | ||
385 | static constexpr size_t scn = kAlpha ? 4 : 3; | ||
386 | }; | ||
387 | |||
388 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
389 | |||
390 | #endif // KLEIDICV_RGB_TO_YUV420_SC_H | ||
391 |