Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_YUV420_TO_RGB_NEON_H | ||
6 | #define KLEIDICV_YUV420_TO_RGB_NEON_H | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <utility> | ||
10 | |||
11 | #include "kleidicv/kleidicv.h" | ||
12 | #include "kleidicv/traits.h" | ||
13 | #include "yuv420_coefficients.h" | ||
14 | |||
15 | namespace kleidicv::neon { | ||
16 | |||
17 | template <bool BGR, bool kAlpha> | ||
18 | class YUV420XToRGBxOrBGRx { | ||
19 | public: | ||
20 | using ScalarType = uint8_t; | ||
21 | using VectorType = uint8x16_t; | ||
22 | |||
23 | int32x4_t y_weight_; | ||
24 | int32x2x2_t uv_weights_; | ||
25 | int32x4_t r_base_, g_base_, b_base_; | ||
26 | int8x16x4_t de_interleave_indices_; | ||
27 | const bool v_first_; | ||
28 | |||
29 | // Returns the number of channels in the output image. | ||
30 | 620 | static constexpr size_t output_channels() { | |
31 | 620 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
32 | } | ||
33 | |||
34 | 8064 | static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { | |
35 | 16128 | return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)), | |
36 | 8064 | vmovn_s32(vshrq_n_s32(b, kWeightScale))); | |
37 | } | ||
38 | |||
39 | // clang-format off | ||
40 | |||
41 | static constexpr int8_t kDeInterleaveTableIndices[64] = { | ||
42 | /* low and even */ | ||
43 | 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, | ||
44 | /* high and even */ | ||
45 | 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1, | ||
46 | /* low and odd */ | ||
47 | 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1, | ||
48 | /* high and odd */ | ||
49 | 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, | ||
50 | }; | ||
51 | |||
52 | // clang-format on | ||
53 | |||
54 | 884 | explicit YUV420XToRGBxOrBGRx(bool v_first) | |
55 | 884 | : y_weight_{vdupq_n_s32(kYWeight)}, | |
56 | 884 | uv_weights_{vld2_s32(kUVWeights)}, | |
57 | 1768 | r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - | |
58 | 884 | 128 * kUVWeights[kRVWeightIndex])}, | |
59 | 1768 | g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - | |
60 | 884 | 128 * (kUVWeights[1] + kUVWeights[2]))}, | |
61 | 884 | b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])}, | |
62 | 884 | de_interleave_indices_{}, | |
63 | 884 | v_first_{v_first} { | |
64 | 884 | neon::VecTraits<int8_t>::load(kDeInterleaveTableIndices, | |
65 | 884 | de_interleave_indices_); | |
66 | 884 | } | |
67 | |||
68 | 672 | void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l, | |
69 | int32x4_t u_h, int32x4_t v_l, int32x4_t v_h, | ||
70 | ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { | ||
71 | // Y' = saturating(Ya - 16) and widen to 32-bits. | ||
72 | 672 | uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16)); | |
73 | 672 | uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16)); | |
74 | |||
75 | 1344 | uint32x4_t y0_m16_even_l = | |
76 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0])); | |
77 | 1344 | uint32x4_t y0_m16_even_h = | |
78 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1])); | |
79 | 1344 | uint32x4_t y0_m16_odd_l = | |
80 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2])); | |
81 | 1344 | uint32x4_t y0_m16_odd_h = | |
82 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3])); | |
83 | |||
84 | 1344 | uint32x4_t y1_m16_even_l = | |
85 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0])); | |
86 | 1344 | uint32x4_t y1_m16_even_h = | |
87 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1])); | |
88 | 1344 | uint32x4_t y1_m16_odd_l = | |
89 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2])); | |
90 | 1344 | uint32x4_t y1_m16_odd_h = | |
91 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3])); | |
92 | |||
93 | // Y = Weight(Y) * Y' | ||
94 | 672 | y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_); | |
95 | 672 | y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_); | |
96 | 672 | y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_); | |
97 | 672 | y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_); | |
98 | |||
99 | 672 | y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_); | |
100 | 672 | y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_); | |
101 | 672 | y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_); | |
102 | 672 | y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_); | |
103 | |||
104 | // Swap U and V planes for YV12 layout. | ||
105 | 8/8 ✓ | 672 | if (v_first_) { |
106 | 336 | std::swap(u_l, v_l); | |
107 | 336 | std::swap(u_h, v_h); | |
108 | 336 | } | |
109 | |||
110 | // R - Y = Rbase + Weight(RV) * V = | ||
111 | // (1 << (SCALE - 1)) - 128 * Weight(RV) + Weight(RV) * V | ||
112 | 672 | int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0); | |
113 | 672 | int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0); | |
114 | |||
115 | // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = | ||
116 | // (1 << (SCALE - 1)) - 128 * Weight(GU) - | ||
117 | // 128 * Weight(GV) + | ||
118 | // Weight(GU) * U + Weight(GV) * V | ||
119 | 672 | int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0); | |
120 | 672 | int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0); | |
121 | 672 | g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1); | |
122 | 672 | g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1); | |
123 | |||
124 | // B - Y = Bbase + Weight(BU) * U = | ||
125 | // (1 << (SCALE - 1)) - 128 * Weight(BU) + Weight(BU) * U | ||
126 | 672 | int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1); | |
127 | 672 | int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1); | |
128 | |||
129 | // R = (R - Y) + Y | ||
130 | 672 | int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l); | |
131 | 672 | int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h); | |
132 | 672 | int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l); | |
133 | 672 | int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h); | |
134 | 672 | int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h); | |
135 | 672 | int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h); | |
136 | |||
137 | 672 | int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l); | |
138 | 672 | int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h); | |
139 | 672 | int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l); | |
140 | 672 | int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h); | |
141 | 672 | int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h); | |
142 | 672 | int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h); | |
143 | |||
144 | // G = (G - Y) + Y | ||
145 | 672 | int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l); | |
146 | 672 | int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h); | |
147 | 672 | int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l); | |
148 | 672 | int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h); | |
149 | 672 | int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h); | |
150 | 672 | int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h); | |
151 | |||
152 | 672 | int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l); | |
153 | 672 | int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h); | |
154 | 672 | int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l); | |
155 | 672 | int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h); | |
156 | 672 | int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h); | |
157 | 672 | int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h); | |
158 | |||
159 | // B = (B - Y) + Y | ||
160 | 672 | int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l); | |
161 | 672 | int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h); | |
162 | 672 | int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l); | |
163 | 672 | int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h); | |
164 | 672 | int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h); | |
165 | 672 | int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h); | |
166 | |||
167 | 672 | int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l); | |
168 | 672 | int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h); | |
169 | 672 | int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l); | |
170 | 672 | int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h); | |
171 | 672 | int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h); | |
172 | 672 | int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h); | |
173 | |||
174 | // Zip even and odd RGB pixels. | ||
175 | 672 | uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); | |
176 | 672 | uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); | |
177 | 672 | uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); | |
178 | 672 | uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); | |
179 | 672 | uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); | |
180 | 672 | uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); | |
181 | |||
182 | if constexpr (kAlpha) { | ||
183 | 336 | uint8x16x4_t rgba0, rgba1; | |
184 | // Red channel | ||
185 | 336 | rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); | |
186 | 336 | rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); | |
187 | // Green channel | ||
188 | 336 | rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); | |
189 | 336 | rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); | |
190 | // Blue channel | ||
191 | 336 | rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); | |
192 | 336 | rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); | |
193 | // Alpha channel | ||
194 | 336 | rgba0.val[3] = vdupq_n_u8(0xFF); | |
195 | 336 | rgba1.val[3] = vdupq_n_u8(0xFF); | |
196 | |||
197 | if constexpr (BGR) { | ||
198 | 168 | std::swap(rgba0.val[0], rgba0.val[2]); | |
199 | 168 | std::swap(rgba1.val[0], rgba1.val[2]); | |
200 | } | ||
201 | |||
202 | // Store RGBA pixels to memory. | ||
203 | 336 | vst4q_u8(rgbx_row_0, rgba0); | |
204 | 336 | vst4q_u8(rgbx_row_1, rgba1); | |
205 | 336 | } else { | |
206 | 336 | uint8x16x3_t rgb0, rgb1; | |
207 | // Red channel | ||
208 | 336 | rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); | |
209 | 336 | rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); | |
210 | // Green channel | ||
211 | 336 | rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); | |
212 | 336 | rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); | |
213 | // Blue channel | ||
214 | 336 | rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); | |
215 | 336 | rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); | |
216 | |||
217 | if constexpr (BGR) { | ||
218 | 168 | std::swap(rgb0.val[0], rgb0.val[2]); | |
219 | 168 | std::swap(rgb1.val[0], rgb1.val[2]); | |
220 | } | ||
221 | |||
222 | // Store RGB pixels to memory. | ||
223 | 336 | vst3q_u8(rgbx_row_0, rgb0); | |
224 | 336 | vst3q_u8(rgbx_row_1, rgb1); | |
225 | 336 | } | |
226 | 672 | } | |
227 | |||
228 | 22840 | void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128, | |
229 | int32_t v_m128, uint8_t *rgbx_rows[2]) { | ||
230 | 8/8 ✓ | 68520 | for (size_t selector = 0; selector < 2; ++selector) { |
231 | 45680 | int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0); | |
232 | 45680 | int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128; | |
233 | 91360 | int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 + | |
234 | 45680 | kUVWeights[kGVWeightIndex] * v_m128; | |
235 | 45680 | int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128; | |
236 | |||
237 | 45680 | r = rounding_shift_right(r, kWeightScale); | |
238 | 45680 | g = rounding_shift_right(g, kWeightScale); | |
239 | 45680 | b = rounding_shift_right(b, kWeightScale); | |
240 | |||
241 | if constexpr (BGR) { | ||
242 | 22840 | std::swap(r, b); | |
243 | } | ||
244 | |||
245 | 45680 | rgbx_rows[selector][0] = saturating_cast<int32_t, uint8_t>(r); | |
246 | 45680 | rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g); | |
247 | 45680 | rgbx_rows[selector][2] = saturating_cast<int32_t, uint8_t>(b); | |
248 | |||
249 | if constexpr (kAlpha) { | ||
250 | 22840 | rgbx_rows[selector][3] = 0xFF; | |
251 | } | ||
252 | |||
253 | 45680 | rgbx_rows[selector] += kAlpha ? 4 : 3; | |
254 | 45680 | } | |
255 | 22840 | } | |
256 | }; | ||
257 | } // namespace kleidicv::neon | ||
258 | |||
259 | #endif // KLEIDICV_YUV420_TO_RGB_NEON_H | ||
260 |
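
The `kDeInterleaveTableIndices` table relies on `vqtbl1q_u8`/`vqtbl1q_s8` returning zero for out-of-range indices, so a pattern like `{0, -1, -1, -1, 2, ...}` splits a register into even and odd bytes and zero-extends them straight into 32-bit lanes. The snippet below is a self-contained sketch of that trick with made-up input data; it is not part of the library.

```cpp
// Standalone sketch: widen the even bytes of a vector into 32-bit lanes
// using a table lookup with -1 (out-of-range) padding.
#include <arm_neon.h>

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical "luma" bytes: 0, 10, 20, ..., 150.
  uint8_t src_bytes[16];
  for (int i = 0; i < 16; ++i) src_bytes[i] = static_cast<uint8_t>(i * 10);
  uint8x16_t src = vld1q_u8(src_bytes);

  // Indices for the "low and even" quarter of the table; -1 selects zero.
  const int8_t low_even_indices[16] = {0, -1, -1, -1, 2, -1, -1, -1,
                                       4, -1, -1, -1, 6, -1, -1, -1};
  uint8x16_t indices = vreinterpretq_u8_s8(vld1q_s8(low_even_indices));

  // Each 32-bit lane now holds one even source byte, zero-extended.
  uint32x4_t widened = vreinterpretq_u32_u8(vqtbl1q_u8(src, indices));

  uint32_t out[4];
  vst1q_u32(out, widened);
  std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  // 0 20 40 60
  return 0;
}
```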
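For reference, here is a minimal scalar sketch of the fixed-point conversion that the class above vectorizes. The weights and 20-bit scale below are assumed BT.601 video-range values chosen for illustration; the real `kYWeight`, `kUVWeights`, and `kWeightScale` are defined in `yuv420_coefficients.h` and are not part of this listing.

```cpp
// Scalar sketch of the fixed-point YUV420 -> RGB math (illustrative weights).
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int kScale = 20;              // assumed fixed-point scale
constexpr int32_t kYW = 1220542;        // ~1.164 * 2^20
constexpr int32_t kRV = 1673527;        // ~1.596 * 2^20
constexpr int32_t kGU = -409993;        // ~-0.391 * 2^20
constexpr int32_t kGV = -852492;        // ~-0.813 * 2^20
constexpr int32_t kBU = 2116026;        // ~2.018 * 2^20

static uint8_t saturate_u8(int32_t v) {
  return static_cast<uint8_t>(std::clamp(v, 0, 255));
}

// One output pixel: Y comes from the luma plane, while U and V are shared by
// a 2x2 block of luma samples (hence the two Y rows in the vector code).
static void yuv_to_rgb_pixel(uint8_t ya, uint8_t ua, uint8_t va,
                             uint8_t rgb[3]) {
  int32_t y = kYW * std::max(static_cast<int32_t>(ya) - 16, 0);
  int32_t u = static_cast<int32_t>(ua) - 128;
  int32_t v = static_cast<int32_t>(va) - 128;
  // Rounding shift right, as in rounding_shift_right(x, kWeightScale).
  const int32_t round = 1 << (kScale - 1);
  rgb[0] = saturate_u8((y + kRV * v + round) >> kScale);
  rgb[1] = saturate_u8((y + kGU * u + kGV * v + round) >> kScale);
  rgb[2] = saturate_u8((y + kBU * u + round) >> kScale);
}

int main() {
  uint8_t rgb[3];
  yuv_to_rgb_pixel(/*ya=*/81, /*ua=*/90, /*va=*/240, rgb);  // roughly pure red
  std::printf("%u %u %u\n", rgb[0], rgb[1], rgb[2]);        // about 254 0 0
  return 0;
}
```

In the vector path, the per-pixel rounding add and the `-128` bias of U and V are folded into `r_base_`, `g_base_`, and `b_base_` in the constructor, so `combine_scaled_s16()` only needs a plain arithmetic shift.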