Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_RGB_TO_YUV420_H | ||
6 | #define KLEIDICV_RGB_TO_YUV420_H | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <utility> | ||
10 | |||
11 | #include "kleidicv/kleidicv.h" | ||
12 | #include "kleidicv/neon.h" | ||
13 | #include "yuv420_coefficients.h" | ||
14 | |||
15 | namespace kleidicv::neon { | ||
16 | |||
17 | template <bool kAlpha, bool RGB, bool kInterleave> | ||
18 | class RGBxorBGRxToYUV420 { | ||
19 | public: | ||
20 | 376 | static kleidicv_error_t rgb2yuv420_operation( | |
21 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
22 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
23 | bool v_first, size_t begin, size_t end) { | ||
24 | 376 | size_t row_begin = begin * 2; | |
25 | 376 | size_t row_end = std::min<size_t>(height, end * 2); | |
26 | |||
27 | 376 | const uint8_t *src_row = nullptr; | |
28 | 376 | uint8_t *y_row = nullptr; | |
29 | 376 | uint8_t *u_row = nullptr; | |
30 | 376 | uint8_t *v_row = nullptr; | |
31 |
8/8✓ Branch 0 taken 94 times.
✓ Branch 1 taken 4810 times.
✓ Branch 2 taken 94 times.
✓ Branch 3 taken 4810 times.
✓ Branch 4 taken 94 times.
✓ Branch 5 taken 4810 times.
✓ Branch 6 taken 94 times.
✓ Branch 7 taken 4810 times.
|
19616 | for (size_t h = row_begin; h < row_end; h++) { |
32 | 19240 | src_row = src + src_stride * h; | |
33 | 19240 | y_row = y_dst + y_stride * h; | |
34 | |||
35 | 19240 | bool evenRow = (h & 1) == 0; | |
36 | |||
37 |
8/8✓ Branch 0 taken 2382 times.
✓ Branch 1 taken 2428 times.
✓ Branch 2 taken 2382 times.
✓ Branch 3 taken 2428 times.
✓ Branch 4 taken 2382 times.
✓ Branch 5 taken 2428 times.
✓ Branch 6 taken 2382 times.
✓ Branch 7 taken 2428 times.
|
19240 | if (evenRow) { |
38 | if constexpr (kInterleave) { | ||
39 | 4912 | u_row = uv_dst + uv_stride * (h / 2); | |
40 | } else { | ||
41 | 4800 | u_row = | |
42 | 4800 | uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2); | |
43 | // Pointer to the current row of the V plane. | ||
44 | // The V plane follows the U plane. Both planes are subsampled 2:1 | ||
45 | // vertically, so each has (height + 1) / 2 chroma rows, and two chroma | ||
46 | // rows of (width + 1) / 2 bytes are packed into every uv_stride-sized | ||
47 | // row of a single contiguous chroma region. V therefore starts | ||
48 | // (height + 1) / 2 chroma rows after U, which is in the middle of a | ||
49 | // stride row when that count is odd; the "% 2" term below selects the | ||
50 | // first or second half of the stride row accordingly. | ||
51 | 9600 | v_row = uv_dst + uv_stride * ((h + height + 1) / 4) + | |
52 | 4800 | (((h + height + 1) / 2) % 2) * ((width + 1) / 2); | |
53 | } | ||
54 | 9712 | } | |
55 | |||
56 | 19240 | LoopUnroll2<TryToAvoidTailLoop> loop{width, kVectorLength}; | |
57 | 30040 | loop.unroll_twice([&](size_t index) { | |
58 | 10800 | vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow); | |
59 | 10800 | }); | |
60 | |||
61 | 77552 | loop.tail([&](size_t index) { | |
62 | 116624 | scalar_path(src_row, y_row, u_row, v_row, v_first, index, width, | |
63 | 58312 | evenRow); | |
64 | 58312 | }); | |
65 | 19240 | } | |
66 | |||
67 | 376 | return KLEIDICV_OK; | |
68 | 376 | } | |
69 | |||
70 | private: | ||
71 | 10800 | static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row, | |
72 | uint8_t *u_row, uint8_t *v_row, const bool v_first, | ||
73 | const size_t index, const bool evenRow) { | ||
74 | 10800 | uint32x4_t r0[4], g0[4], b0[4], r1[4], g1[4], b1[4]; | |
75 | |||
76 | 10800 | load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, index); | |
77 | |||
78 | 10800 | uint8x16_t y0 = rgb_to_y(r0, g0, b0); | |
79 | |||
80 | 10800 | uint8x16_t y1 = rgb_to_y(r1, g1, b1); | |
81 | |||
82 | 10800 | vst1q_u8(y_row + index, y0); | |
83 | 10800 | vst1q_u8(y_row + index + kVectorLength, y1); | |
84 | |||
85 | // U and V are subsampled by a factor of 2 in both horizontal and vertical | ||
86 | // directions for YUV420 format. Therefore, we only compute U and V from | ||
87 | // even rows and even columns. When the input RGB image has an odd width or | ||
88 | // height, the chroma (U and V) dimensions are rounded up. For example, if | ||
89 | // the height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2 | ||
90 | // = 4.5 -> rounded up). The same rounding is applied for width. | ||
91 |
8/8✓ Branch 0 taken 1300 times.
✓ Branch 1 taken 1400 times.
✓ Branch 2 taken 1300 times.
✓ Branch 3 taken 1400 times.
✓ Branch 4 taken 1300 times.
✓ Branch 5 taken 1400 times.
✓ Branch 6 taken 1300 times.
✓ Branch 7 taken 1400 times.
|
10800 | if (evenRow) { |
92 | 5600 | uint8x16x2_t uv; | |
93 | 5600 | int32x4_t r_even[4] = {r0[0], r0[2], r1[0], r1[2]}; | |
94 | 5600 | int32x4_t g_even[4] = {g0[0], g0[2], g1[0], g1[2]}; | |
95 | 5600 | int32x4_t b_even[4] = {b0[0], b0[2], b1[0], b1[2]}; | |
96 | 5600 | rgb_to_uv_2x(r_even, g_even, b_even, uv.val[0], uv.val[1]); | |
97 | |||
98 |
8/8✓ Branch 0 taken 700 times.
✓ Branch 1 taken 700 times.
✓ Branch 2 taken 700 times.
✓ Branch 3 taken 700 times.
✓ Branch 4 taken 700 times.
✓ Branch 5 taken 700 times.
✓ Branch 6 taken 700 times.
✓ Branch 7 taken 700 times.
|
5600 | if (v_first) { |
99 | 2800 | std::swap(uv.val[0], uv.val[1]); | |
100 | 2800 | } | |
101 | |||
102 | if constexpr (kInterleave) { | ||
103 | 2800 | vst2q_u8(u_row + index, uv); | |
104 | } else { | ||
105 | 2800 | vst1q_u8(u_row + index / 2, uv.val[0]); | |
106 | 2800 | vst1q_u8(v_row + index / 2, uv.val[1]); | |
107 | } | ||
108 | 5600 | } | |
109 | 10800 | } | |
110 | |||
111 | 58312 | static void scalar_path(const uint8_t *src_row, uint8_t *y_row, | |
112 | uint8_t *u_row, uint8_t *v_row, const bool v_first, | ||
113 | size_t index, const size_t length, | ||
114 | const bool evenRow) { | ||
115 | 58312 | const size_t u_index_ = v_first; | |
116 | 58312 | const size_t v_index_ = !v_first; | |
117 | |||
118 |
8/8✓ Branch 0 taken 65246 times.
✓ Branch 1 taken 14578 times.
✓ Branch 2 taken 65246 times.
✓ Branch 3 taken 14578 times.
✓ Branch 4 taken 65246 times.
✓ Branch 5 taken 14578 times.
✓ Branch 6 taken 65246 times.
✓ Branch 7 taken 14578 times.
|
319296 | for (; index < length; index += 1) { |
119 | 260984 | uint8_t b0{}, g0{}, r0{}; | |
120 | 260984 | bool evenCol = (index & 1) == 0; | |
121 | 260984 | b0 = src_row[index * scn + b_index_]; | |
122 | 260984 | g0 = src_row[index * scn + g_index_]; | |
123 | 260984 | r0 = src_row[index * scn + r_index_]; | |
124 | |||
125 | 260984 | uint8_t y0 = rgb_to_y(r0, g0, b0); | |
126 | 260984 | y_row[index] = y0; | |
127 | |||
128 | // U and V are subsampled by a factor of 2 in both horizontal and vertical | ||
129 | // directions for YUV420 format. Therefore, we only compute U and V from | ||
130 | // pixels that lie in even rows and even columns. | ||
131 | // When the input RGB image has an odd width or height, the chroma | ||
132 | // (U and V) dimensions are rounded up. For example, if the height is 9, | ||
133 | // Y will have 9 rows, but U and V will have 5 rows | ||
134 | // (9 / 2 = 4.5 -> rounded up). The same rounding is applied to the width. | ||
135 |
16/16✓ Branch 0 taken 33384 times.
✓ Branch 1 taken 31862 times.
✓ Branch 2 taken 18092 times.
✓ Branch 3 taken 15292 times.
✓ Branch 4 taken 33384 times.
✓ Branch 5 taken 31862 times.
✓ Branch 6 taken 18092 times.
✓ Branch 7 taken 15292 times.
✓ Branch 8 taken 33384 times.
✓ Branch 9 taken 31862 times.
✓ Branch 10 taken 18092 times.
✓ Branch 11 taken 15292 times.
✓ Branch 12 taken 33384 times.
✓ Branch 13 taken 31862 times.
✓ Branch 14 taken 18092 times.
✓ Branch 15 taken 15292 times.
|
260984 | if (evenRow && evenCol) { |
136 | 61168 | uint8_t uv[2] = {0, 0}; | |
137 | 61168 | rgb_to_uv(r0, g0, b0, uv); | |
138 | if constexpr (kInterleave) { | ||
139 | 37248 | u_row[index] = uv[u_index_]; | |
140 | 37248 | u_row[index + 1] = uv[v_index_]; | |
141 | } else { | ||
142 | 23920 | u_row[(index + 1) / 2] = uv[u_index_]; | |
143 | 23920 | v_row[(index + 1) / 2] = uv[v_index_]; | |
144 | } | ||
145 | 61168 | } | |
146 | 260984 | } | |
147 | 58312 | } | |
148 | |||
149 | 260984 | static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) { | |
150 | 260984 | const int kShifted16 = (16 << kWeightScale); | |
151 | 260984 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
152 | 521968 | int yy = | |
153 | 260984 | kRYWeight * r + kGYWeight * g + kBYWeight * b + kHalfShift + kShifted16; | |
154 | |||
155 | 521968 | return std::clamp(yy >> kWeightScale, 0, 0xff); | |
156 | 260984 | } | |
157 | |||
158 | 21600 | static uint8x16_t rgb_to_y(const uint32x4_t r[4], const uint32x4_t g[4], | |
159 | const uint32x4_t b[4]) { | ||
160 | 21600 | const int kShifted16 = (16 << kWeightScale); | |
161 | 21600 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
162 | |||
163 | // Y = kRY*R + kGY*G + kBY*B + rounding bias + (16 << kWeightScale) offset | ||
164 | 21600 | uint32x4_t v_kRYWeight = vdupq_n_u32(kRYWeight); | |
165 | 21600 | uint32x4_t v_kGYWeight = vdupq_n_u32(kGYWeight); | |
166 | 21600 | uint32x4_t v_kBYWeight = vdupq_n_u32(kBYWeight); | |
167 | 21600 | uint32x4_t y[4]; | |
168 | |||
169 | KLEIDICV_FORCE_LOOP_UNROLL | ||
170 |
8/8✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 21600 times.
✓ Branch 2 taken 5400 times.
✓ Branch 3 taken 21600 times.
✓ Branch 4 taken 5400 times.
✓ Branch 5 taken 21600 times.
✓ Branch 6 taken 5400 times.
✓ Branch 7 taken 21600 times.
|
108000 | for (int i = 0; i < 4; i++) { |
171 | 86400 | y[i] = vdupq_n_u32(kHalfShift + kShifted16); | |
172 | 86400 | y[i] = vmlaq_u32(y[i], r[i], v_kRYWeight); | |
173 | 86400 | y[i] = vmlaq_u32(y[i], g[i], v_kGYWeight); | |
174 | 86400 | y[i] = vmlaq_u32(y[i], b[i], v_kBYWeight); | |
175 | 86400 | } | |
176 | |||
177 | 43200 | return normalize_and_pack_y(y); | |
178 | 21600 | } | |
179 | |||
180 | 61168 | static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t uv[2]) { | |
181 | 61168 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
182 | 61168 | const int kShifted128 = (128 << kWeightScale); | |
183 | 61168 | int uu = kRUWeight * r + kGUWeight * g + kBUWeight * b + kHalfShift + | |
184 | kShifted128; | ||
185 | 61168 | int vv = kBUWeight * r + kGVWeight * g + kBVWeight * b + kHalfShift + | |
186 | kShifted128; | ||
187 | |||
188 | 61168 | uv[0] = std::clamp(uu >> kWeightScale, 0, 0xff); | |
189 | 61168 | uv[1] = std::clamp(vv >> kWeightScale, 0, 0xff); | |
190 | 61168 | } | |
191 | |||
192 | 11200 | static uint8x16_t compute_u_or_v_2x(const int32x4_t r[4], | |
193 | const int32x4_t g[4], | ||
194 | const int32x4_t b[4], const int r_coeff, | ||
195 | const int g_coeff, const int b_coeff) { | ||
196 | // Constants for U/V calculation | ||
197 | 11200 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
198 | 11200 | const int kShifted128 = (128 << kWeightScale); | |
199 | |||
200 | 11200 | int32x4_t v_r_coeff = vdupq_n_s32(r_coeff); | |
201 | 11200 | int32x4_t v_g_coeff = vdupq_n_s32(g_coeff); | |
202 | 11200 | int32x4_t v_b_coeff = vdupq_n_s32(b_coeff); | |
203 | 11200 | int32x4_t uv[4]; | |
204 | |||
205 | KLEIDICV_FORCE_LOOP_UNROLL | ||
206 |
8/8✓ Branch 0 taken 2800 times.
✓ Branch 1 taken 11200 times.
✓ Branch 2 taken 2800 times.
✓ Branch 3 taken 11200 times.
✓ Branch 4 taken 2800 times.
✓ Branch 5 taken 11200 times.
✓ Branch 6 taken 2800 times.
✓ Branch 7 taken 11200 times.
|
56000 | for (int i = 0; i < 4; i++) { |
207 | 44800 | uv[i] = vdupq_n_s32(kHalfShift + kShifted128); | |
208 | 44800 | uv[i] = vmlaq_s32(uv[i], r[i], v_r_coeff); | |
209 | 44800 | uv[i] = vmlaq_s32(uv[i], g[i], v_g_coeff); | |
210 | 44800 | uv[i] = vmlaq_s32(uv[i], b[i], v_b_coeff); | |
211 | 44800 | } | |
212 | |||
213 | 22400 | return normalize_and_pack_u_or_v(uv); | |
214 | 11200 | } | |
215 | |||
216 | 5600 | static void rgb_to_uv_2x(const int32x4_t r[4], const int32x4_t g[4], | |
217 | const int32x4_t b[4], uint8x16_t &u, uint8x16_t &v) { | ||
218 | // ---------------- U (Cb) Component ---------------- | ||
219 | // U = R * kRU + G * kGU + B * kBU + bias | ||
220 | 5600 | u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight); | |
221 | |||
222 | // ---------------- V (Cr) Component ---------------- | ||
223 | // V = R * kBU + G * kGV + B * kBV + bias | ||
224 | 5600 | v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight); | |
225 | 5600 | } | |
226 | |||
227 | 21600 | static uint8x16_t normalize_and_pack_y(uint32x4_t vec[4]) { | |
228 | // The y_index table selects the correct output order after normalization. | ||
229 | // When we load and separate the RGB values for UV calculation, we | ||
230 | // deinterleave them into even and odd components. As a result, the | ||
231 | // processed values are stored in two separate vectors. During | ||
232 | // normalization, we need to interleave them again to produce the final | ||
233 | // contiguous output, and this index pattern achieves that. | ||
234 | 21600 | uint8x16_t y_index = {1, 17, 3, 19, 5, 21, 7, 23, | |
235 | 9, 25, 11, 27, 13, 29, 15, 31}; | ||
236 | |||
237 | // Normalize by right-shifting the fixed-point result. | ||
238 | // vshrn_n can only shift by an immediate value between 1 and 16, so a | ||
239 | // single shift by kWeightScale (20) is not possible. Instead we shift | ||
240 | // right by (kWeightScale - 8) = 12 bits while narrowing to 16 bits. This | ||
241 | // ensures that the final 8-bit result lies in the second byte of each | ||
242 | // 16-bit element. As a result, the lookup table is constructed with only | ||
243 | // odd indices to extract the second byte from each element. | ||
244 | 21600 | uint16x4_t tmp_lo_lo = vshrn_n_u32(vec[0], kWeightScale - 8); | |
245 | 43200 | uint16x8_t tmp_lo_hi = | |
246 | 21600 | vshrn_high_n_u32(tmp_lo_lo, vec[2], kWeightScale - 8); | |
247 | 21600 | uint16x4_t tmp_hi_lo = vshrn_n_u32(vec[1], kWeightScale - 8); | |
248 | 43200 | uint16x8_t tmp_hi_hi = | |
249 | 21600 | vshrn_high_n_u32(tmp_hi_lo, vec[3], kWeightScale - 8); | |
250 | |||
251 | 21600 | uint8x16x2_t tmp; | |
252 | 21600 | tmp.val[0] = vreinterpretq_u8(tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14 | |
253 | 21600 | tmp.val[1] = vreinterpretq_u8(tmp_hi_hi); // 1, 3, 5, 7, 9, 11, 13, 15 | |
254 | |||
255 | 21600 | uint8x16_t output = vqtbl2q_u8(tmp, y_index); | |
256 | |||
257 | 43200 | return output; | |
258 | 21600 | } | |
259 | |||
260 | 11200 | static uint8x16_t normalize_and_pack_u_or_v(int32x4_t vec[4]) { | |
261 | // The uv_index table is used to finalize the order of U and V values. | ||
262 | // Unlike the Y component, we don't need to interleave even and odd elements | ||
263 | // manually. This is because the first vector already contains even-indexed | ||
264 | // values from the lower RGB block, and the second vector contains | ||
265 | // even-indexed values from the higher RGB block. As a result, the values | ||
266 | // are already sorted in the correct order for output. | ||
267 | 11200 | uint8x16_t uv_index = {1, 3, 5, 7, 9, 11, 13, 15, | |
268 | 17, 19, 21, 23, 25, 27, 29, 31}; | ||
269 | |||
270 | // Normalize by right-shifting the fixed-point result. | ||
271 | // vshrn_n can only shift by an immediate value between 1 and 16, so a | ||
272 | // single shift by kWeightScale (20) is not possible. Instead we shift | ||
273 | // right by (kWeightScale - 8) = 12 bits while narrowing to 16 bits. This | ||
274 | // ensures that the final 8-bit result lies in the second byte of each | ||
275 | // 16-bit element. As a result, the lookup table is constructed with only | ||
276 | // odd indices to extract the second byte from each element. | ||
277 | 11200 | int16x4_t tmp_lo_lo = vshrn_n_s32(vec[0], kWeightScale - 8); | |
278 | 11200 | int16x8_t tmp_lo_hi = vshrn_high_n_s32(tmp_lo_lo, vec[1], kWeightScale - 8); | |
279 | 11200 | int16x4_t tmp_hi_lo = vshrn_n_s32(vec[2], kWeightScale - 8); | |
280 | 11200 | int16x8_t tmp_hi_hi = vshrn_high_n_s32(tmp_hi_lo, vec[3], kWeightScale - 8); | |
281 | |||
282 | 11200 | uint8x16x2_t tmp; | |
283 | 11200 | tmp.val[0] = vreinterpretq_u8( | |
284 | 11200 | tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the first vector | |
285 | 11200 | tmp.val[1] = vreinterpretq_u8( | |
286 | 11200 | tmp_hi_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the second vector | |
287 | 11200 | uint8x16_t output = vqtbl2q_u8(tmp, uv_index); | |
288 | |||
289 | 22400 | return output; | |
290 | 11200 | } | |
291 | |||
292 | 10800 | static void load_rgb_2x(uint32x4_t r0[4], uint32x4_t g0[4], uint32x4_t b0[4], | |
293 | uint32x4_t r1[4], uint32x4_t g1[4], uint32x4_t b1[4], | ||
294 | const uint8_t *src_row, const size_t index) { | ||
295 | 10800 | uint8x16_t tmp_b0, tmp_b1, tmp_g0, tmp_g1, tmp_r0, tmp_r1; | |
296 | // Load 32 pixels: two vectors of interleaved channels | ||
297 | |||
298 | if constexpr (kAlpha) { | ||
299 | // 4-channel input (RGBA or BGRA) | ||
300 | 5400 | uint8x16x4_t vsrc0 = vld4q_u8(src_row + scn * index); | |
301 | 5400 | uint8x16x4_t vsrc1 = | |
302 | 5400 | vld4q_u8(src_row + scn * index + scn * kVectorLength); | |
303 | |||
304 | 5400 | tmp_b0 = vsrc0.val[b_index_]; | |
305 | 5400 | tmp_g0 = vsrc0.val[g_index_]; | |
306 | 5400 | tmp_r0 = vsrc0.val[r_index_]; | |
307 | |||
308 | 5400 | tmp_b1 = vsrc1.val[b_index_]; | |
309 | 5400 | tmp_g1 = vsrc1.val[g_index_]; | |
310 | 5400 | tmp_r1 = vsrc1.val[r_index_]; | |
311 | 5400 | } else { | |
312 | // 3-channel input (RGB or BGR) | ||
313 | 5400 | uint8x16x3_t vsrc0 = vld3q_u8(src_row + scn * index); | |
314 | 5400 | uint8x16x3_t vsrc1 = | |
315 | 5400 | vld3q_u8(src_row + scn * index + scn * kVectorLength); | |
316 | |||
317 | 5400 | tmp_b0 = vsrc0.val[b_index_]; | |
318 | 5400 | tmp_g0 = vsrc0.val[g_index_]; | |
319 | 5400 | tmp_r0 = vsrc0.val[r_index_]; | |
320 | |||
321 | 5400 | tmp_b1 = vsrc1.val[b_index_]; | |
322 | 5400 | tmp_g1 = vsrc1.val[g_index_]; | |
323 | 5400 | tmp_r1 = vsrc1.val[r_index_]; | |
324 | 5400 | } | |
325 | // After loading the vector, we extend the channels and separate even and | ||
326 | // odd elements. This separation is important for UV calculation, as only | ||
327 | // the even-indexed values are used. | ||
328 | 10800 | uint8x16_t indices[4] = { | |
329 | 0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 4, 0xff, 0xff, | ||
330 | 0xff, 6, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 3, 0xff, | ||
331 | 0xff, 0xff, 5, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 8, | ||
332 | 0xff, 0xff, 0xff, 10, 0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff, | ||
333 | 14, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, 11, 0xff, 0xff, | ||
334 | 0xff, 13, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff}; | ||
335 | |||
336 | // Expand each 8-bit channel into 32-bit vectors using table lookup and | ||
337 | // reinterpret | ||
338 | KLEIDICV_FORCE_LOOP_UNROLL | ||
339 |
8/8✓ Branch 0 taken 2700 times.
✓ Branch 1 taken 10800 times.
✓ Branch 2 taken 2700 times.
✓ Branch 3 taken 10800 times.
✓ Branch 4 taken 2700 times.
✓ Branch 5 taken 10800 times.
✓ Branch 6 taken 2700 times.
✓ Branch 7 taken 10800 times.
|
54000 | for (int i = 0; i < 4; i++) { |
340 | 43200 | r0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r0, indices[i])); | |
341 | 43200 | g0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g0, indices[i])); | |
342 | 43200 | b0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b0, indices[i])); | |
343 | 43200 | r1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r1, indices[i])); | |
344 | 43200 | g1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g1, indices[i])); | |
345 | 43200 | b1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b1, indices[i])); | |
346 | 43200 | } | |
347 | 10800 | } | |
348 | |||
349 | static constexpr size_t r_index_ = RGB ? 0 : 2; | ||
350 | static constexpr size_t g_index_ = 1; | ||
351 | static constexpr size_t b_index_ = RGB ? 2 : 0; | ||
352 | static constexpr size_t scn = kAlpha ? 4 : 3; | ||
353 | }; | ||
354 | |||
355 | } // namespace kleidicv::neon | ||
356 | |||
357 | #endif // KLEIDICV_RGB_TO_YUV420_H | ||
358 |