KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv420_to_rgb_neon.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 144 144 100.0%
Functions: 20 20 100.0%
Branches: 16 16 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV420_TO_RGB_NEON_H
6 #define KLEIDICV_YUV420_TO_RGB_NEON_H
7
8 #include <algorithm>
9 #include <utility>
10
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/traits.h"
13 #include "yuv420_coefficients.h"
14
15 namespace kleidicv::neon {
16
17 template <bool BGR, bool kAlpha>
18 class YUV420XToRGBxOrBGRx {
19 public:
20 using ScalarType = uint8_t;
21 using VectorType = uint8x16_t;
22
23 int32x4_t y_weight_;
24 int32x2x2_t uv_weights_;
25 int32x4_t r_base_, g_base_, b_base_;
26 int8x16x4_t de_interleave_indices_;
27 const bool v_first_;
28
29 // Returns the number of channels in the output image.
30 620 static constexpr size_t output_channels() {
31 620 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
32 }
33
34 8064 static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
35 16128 return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)),
36 8064 vmovn_s32(vshrq_n_s32(b, kWeightScale)));
37 }
38
39 // clang-format off
40
41 static constexpr int8_t kDeInterleaveTableIndices[64] = {
42 /* low and even */
43 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1,
44 /* high and even */
45 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1,
46 /* low and odd */
47 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1,
48 /* high and odd */
49 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1,
50 };
51
52 // clang-format on
53
54 884 explicit YUV420XToRGBxOrBGRx(bool v_first)
55 884 : y_weight_{vdupq_n_s32(kYWeight)},
56 884 uv_weights_{vld2_s32(kUVWeights)},
57 1768 r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
58 884 128 * kUVWeights[kRVWeightIndex])},
59 1768 g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
60 884 128 * (kUVWeights[1] + kUVWeights[2]))},
61 884 b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])},
62 884 de_interleave_indices_{},
63 884 v_first_{v_first} {
64 884 neon::VecTraits<int8_t>::load(kDeInterleaveTableIndices,
65 884 de_interleave_indices_);
66 884 }
67
68 672 void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l,
69 int32x4_t u_h, int32x4_t v_l, int32x4_t v_h,
70 ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
71 // Y' = saturating(Ya - 16) and widen to 32-bits.
72 672 uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16));
73 672 uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16));
74
75 1344 uint32x4_t y0_m16_even_l =
76 672 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0]));
77 1344 uint32x4_t y0_m16_even_h =
78 672 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1]));
79 1344 uint32x4_t y0_m16_odd_l =
80 672 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2]));
81 1344 uint32x4_t y0_m16_odd_h =
82 672 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3]));
83
84 1344 uint32x4_t y1_m16_even_l =
85 672 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0]));
86 1344 uint32x4_t y1_m16_even_h =
87 672 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1]));
88 1344 uint32x4_t y1_m16_odd_l =
89 672 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2]));
90 1344 uint32x4_t y1_m16_odd_h =
91 672 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3]));
92
93 // Y = Weight(Y) * Y'
94 672 y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_);
95 672 y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_);
96 672 y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_);
97 672 y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_);
98
99 672 y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_);
100 672 y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_);
101 672 y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_);
102 672 y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_);
103
104 // Swap U and V planes for YV12 layout.
105
8/8
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
✓ Branch 2 taken 84 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.
✓ Branch 5 taken 84 times.
✓ Branch 6 taken 84 times.
✓ Branch 7 taken 84 times.
672 if (v_first_) {
106 336 std::swap(u_l, v_l);
107 336 std::swap(u_h, v_h);
108 336 }
109
110 // R - Y = Rbase + Weight(RV) * V =
111 // (1 << (SCALE - 1)) - 128 * Weight(RV) + Weight(RV) * V
112 672 int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0);
113 672 int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0);
114
115 // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V =
116 // (1 << (SCALE - 1)) -
117 // 128 * (Weight(GU) + Weight(GV)) +
118 // Weight(GU) * U + Weight(GV) * V
119 672 int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0);
120 672 int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0);
121 672 g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1);
122 672 g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1);
123
124 // B - Y = Bbase + Weight(BU) * U =
125 // (1 << (SCALE - 1)) - 128 * Weight(BU) + Weight(BU) * U
126 672 int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1);
127 672 int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1);
128
129 // R = (R - Y) + Y
130 672 int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l);
131 672 int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h);
132 672 int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l);
133 672 int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h);
134 672 int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h);
135 672 int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h);
136
137 672 int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l);
138 672 int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h);
139 672 int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l);
140 672 int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h);
141 672 int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h);
142 672 int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h);
143
144 // G = (G - Y) + Y
145 672 int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l);
146 672 int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h);
147 672 int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l);
148 672 int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h);
149 672 int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h);
150 672 int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h);
151
152 672 int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l);
153 672 int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h);
154 672 int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l);
155 672 int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h);
156 672 int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h);
157 672 int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h);
158
159 // B = (B - Y) + Y
160 672 int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l);
161 672 int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h);
162 672 int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l);
163 672 int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h);
164 672 int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h);
165 672 int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h);
166
167 672 int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l);
168 672 int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h);
169 672 int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l);
170 672 int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h);
171 672 int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h);
172 672 int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h);
173
174 // Zip even and odd RGB pixels.
175 672 uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
176 672 uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
177 672 uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
178 672 uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
179 672 uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
180 672 uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
181
182 if constexpr (kAlpha) {
183 336 uint8x16x4_t rgba0, rgba1;
184 // Red channel
185 336 rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]);
186 336 rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]);
187 // Green channel
188 336 rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]);
189 336 rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
190 // Blue channel
191 336 rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]);
192 336 rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]);
193 // Alpha channel
194 336 rgba0.val[3] = vdupq_n_u8(0xFF);
195 336 rgba1.val[3] = vdupq_n_u8(0xFF);
196
197 if constexpr (BGR) {
198 168 std::swap(rgba0.val[0], rgba0.val[2]);
199 168 std::swap(rgba1.val[0], rgba1.val[2]);
200 }
201
202 // Store RGBA (or BGRA) pixels to memory.
203 336 vst4q_u8(rgbx_row_0, rgba0);
204 336 vst4q_u8(rgbx_row_1, rgba1);
205 336 } else {
206 336 uint8x16x3_t rgb0, rgb1;
207 // Red channel
208 336 rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]);
209 336 rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]);
210 // Green channel
211 336 rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]);
212 336 rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
213 // Blue channel
214 336 rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]);
215 336 rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]);
216
217 if constexpr (BGR) {
218 168 std::swap(rgb0.val[0], rgb0.val[2]);
219 168 std::swap(rgb1.val[0], rgb1.val[2]);
220 }
221
222 // Store RGB pixels to memory.
223 336 vst3q_u8(rgbx_row_0, rgb0);
224 336 vst3q_u8(rgbx_row_1, rgb1);
225 336 }
226 672 }
227
228 22840 void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128,
229 int32_t v_m128, uint8_t *rgbx_rows[2]) {
230
8/8
✓ Branch 0 taken 5710 times.
✓ Branch 1 taken 11420 times.
✓ Branch 2 taken 5710 times.
✓ Branch 3 taken 11420 times.
✓ Branch 4 taken 5710 times.
✓ Branch 5 taken 11420 times.
✓ Branch 6 taken 5710 times.
✓ Branch 7 taken 11420 times.
68520 for (size_t selector = 0; selector < 2; ++selector) {
231 45680 int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0);
232 45680 int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128;
233 91360 int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 +
234 45680 kUVWeights[kGVWeightIndex] * v_m128;
235 45680 int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128;
236
237 45680 r = rounding_shift_right(r, kWeightScale);
238 45680 g = rounding_shift_right(g, kWeightScale);
239 45680 b = rounding_shift_right(b, kWeightScale);
240
241 if constexpr (BGR) {
242 22840 std::swap(r, b);
243 }
244
245 45680 rgbx_rows[selector][0] = saturating_cast<int32_t, uint8_t>(r);
246 45680 rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g);
247 45680 rgbx_rows[selector][2] = saturating_cast<int32_t, uint8_t>(b);
248
249 if constexpr (kAlpha) {
250 22840 rgbx_rows[selector][3] = 0xFF;
251 }
252
253 45680 rgbx_rows[selector] += kAlpha ? 4 : 3;
254 45680 }
255 22840 }
256 };
257 } // namespace kleidicv::neon
258
259 #endif // KLEIDICV_YUV420_TO_RGB_NEON_H
260