KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv420_to_rgb_neon.h
Date: 2026-01-20 20:58:59
            Exec   Total   Coverage
Lines:       144     144     100.0%
Functions:    20      20     100.0%
Branches:     16      16     100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV420_TO_RGB_NEON_H
6 #define KLEIDICV_YUV420_TO_RGB_NEON_H
7
8 #include <algorithm>
9 #include <utility>
10
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/traits.h"
13 #include "yuv42x_coefficients.h"
14
15 namespace kleidicv::neon {
16
17 template <bool BGR, bool kAlpha>
18 class YUV420XToRGBxOrBGRx {
19 public:
20 using ScalarType = uint8_t;
21 using VectorType = uint8x16_t;
22
23 int32x4_t y_weight_;
24 int32x2x2_t uv_weights_;
25 int32x4_t r_base_, g_base_, b_base_;
26 int8x16x4_t de_interleave_indices_;
27 const bool v_first_;
28
29 // Returns the number of channels in the output image.
30 984 static constexpr size_t output_channels() {
31 984 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
32 }
33
34 KLEIDICV_FORCE_INLINE
35 4608 static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
36 9216 return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)),
37 4608 vmovn_s32(vshrq_n_s32(b, kWeightScale)));
38 }
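
combine_scaled_s16 finishes the fixed-point arithmetic: each 32-bit accumulator lane is shifted right by kWeightScale, narrowed to 16 bits, and the two halves are joined. A minimal scalar sketch of one lane, assuming only that kWeightScale is the fraction width of the weights in yuv42x_coefficients.h:

    // Illustrative sketch, not library code. The rounding bias
    // (1 << (kWeightScale - 1)) is pre-folded into r_base_/g_base_/b_base_,
    // so a plain arithmetic shift completes a round-to-nearest rescale.
    // Like vmovn_s32, the narrowing keeps only the low 16 bits;
    // saturation to [0, 255] happens later in vqmovun_s16.
    static inline int16_t scale_lane(int32_t acc, int weight_scale) {
      return static_cast<int16_t>(acc >> weight_scale);
    }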
39
40 // clang-format off
41
42 static constexpr int8_t kDeInterleaveTableIndices[64] = {
43 /* low and even */
44 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1,
45 /* high and even */
46 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1,
47 /* low and odd */
48 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1,
49 /* high and odd */
50 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1,
51 };
52
53 // clang-format on
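
The -1 entries exploit TBL's out-of-range behavior: any index outside 0..15 writes zero to its destination byte. Selecting source bytes 0, 2, 4, ... into every fourth destination byte therefore zero-extends the even (or odd) uint8_t elements straight into 32-bit lanes with a single lookup. A self-contained sketch of the same trick using the plain unsigned intrinsic (the class itself stores the table as int8x16x4_t and uses vqtbl1q_s8):

    // Illustrative sketch, not library code (requires <arm_neon.h>).
    // 0xFF is out of range for vqtbl1q_u8, so those destination bytes
    // become zero; each selected byte lands in the least significant
    // position of its 32-bit lane (little-endian lane order assumed).
    static inline uint32x4_t widen_low_even_bytes(uint8x16_t src) {
      const uint8_t idx[16] = {0, 0xFF, 0xFF, 0xFF, 2, 0xFF, 0xFF, 0xFF,
                               4, 0xFF, 0xFF, 0xFF, 6, 0xFF, 0xFF, 0xFF};
      return vreinterpretq_u32_u8(vqtbl1q_u8(src, vld1q_u8(idx)));
    }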
54
55 1208 explicit YUV420XToRGBxOrBGRx(bool v_first)
56 1208 : y_weight_{vdupq_n_s32(kYWeight)},
57 1208 uv_weights_{vld2_s32(kUVWeights)},
58 2416 r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
59 1208 128 * kUVWeights[kRVWeightIndex])},
60 2416 g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
61 1208 128 * (kUVWeights[1] + kUVWeights[2]))},
62 1208 b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])},
63 1208 de_interleave_indices_{},
64 1208 v_first_{v_first} {
65 1208 neon::VecTraits<int8_t>::load(kDeInterleaveTableIndices,
66 1208 de_interleave_indices_);
67 1208 }
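
The three base registers fold the rounding bias and the unsigned-to-signed chroma offset into a single constant per channel, so the hot loop needs only multiply-accumulates. For red, starting from the usual fixed-point form (the scalar path at the bottom of this file follows the same shape):

    R << SCALE = Weight(Y) * (Y - 16) + Weight(RV) * (V - 128) + (1 << (SCALE - 1))
               = Weight(Y) * (Y - 16) + Weight(RV) * V
                 + [(1 << (SCALE - 1)) - 128 * Weight(RV)]
               = Yterm + Weight(RV) * V + Rbase

Gbase and Bbase follow the same pattern with their own weights, which is exactly what the initializer list above computes.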
68
69 KLEIDICV_FORCE_INLINE
70 384 void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l,
71 int32x4_t u_h, int32x4_t v_l, int32x4_t v_h,
72 ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
73 // Y' = saturating(Y - 16), widened to 32 bits below.
74 384 uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16));
75 384 uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16));
76
77 768 uint32x4_t y0_m16_even_l =
78 384 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0]));
79 768 uint32x4_t y0_m16_even_h =
80 384 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1]));
81 768 uint32x4_t y0_m16_odd_l =
82 384 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2]));
83 768 uint32x4_t y0_m16_odd_h =
84 384 vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3]));
85
86 768 uint32x4_t y1_m16_even_l =
87 384 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0]));
88 768 uint32x4_t y1_m16_even_h =
89 384 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1]));
90 768 uint32x4_t y1_m16_odd_l =
91 384 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2]));
92 768 uint32x4_t y1_m16_odd_h =
93 384 vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3]));
94
95 // Y = Weight(Y) * Y'
96 384 y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_);
97 384 y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_);
98 384 y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_);
99 384 y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_);
100
101 384 y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_);
102 384 y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_);
103 384 y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_);
104 384 y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_);
105
106 // Swap U and V for V-first layouts such as YV12.
107 8/8 384 if (v_first_) {
    ✓ Branch 0 taken 48 times.
    ✓ Branch 1 taken 48 times.
    ✓ Branch 2 taken 48 times.
    ✓ Branch 3 taken 48 times.
    ✓ Branch 4 taken 48 times.
    ✓ Branch 5 taken 48 times.
    ✓ Branch 6 taken 48 times.
    ✓ Branch 7 taken 48 times.
108 192 std::swap(u_l, v_l);
109 192 std::swap(u_h, v_h);
110 192 }
111
112 // R - Y = Rbase + Weight(RV) * V =
113 // (1 << (SCALE - 1)) - 128 * Weight(RV) + Weight(RV) * V
114 384 int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0);
115 384 int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0);
116
117 // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V =
118 // (1 << (SCALE - 1)) -
119 // 128 * (Weight(GU) + Weight(GV)) +
120 // Weight(GU) * U + Weight(GV) * V
121 384 int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0);
122 384 int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0);
123 384 g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1);
124 384 g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1);
125
126 // B - Y = Bbase + Weight(BU) * U =
127 // (1 << (SCALE - 1)) - 128 * Weight(BU) + Weight(BU) * U
128 384 int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1);
129 384 int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1);
130
131 // R = (R - Y) + Y
132 384 int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l);
133 384 int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h);
134 384 int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l);
135 384 int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h);
136 384 int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h);
137 384 int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h);
138
139 384 int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l);
140 384 int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h);
141 384 int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l);
142 384 int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h);
143 384 int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h);
144 384 int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h);
145
146 // G = (G - Y) + Y
147 384 int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l);
148 384 int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h);
149 384 int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l);
150 384 int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h);
151 384 int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h);
152 384 int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h);
153
154 384 int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l);
155 384 int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h);
156 384 int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l);
157 384 int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h);
158 384 int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h);
159 384 int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h);
160
161 // B = (B - Y) + Y
162 384 int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l);
163 384 int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h);
164 384 int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l);
165 384 int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h);
166 384 int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h);
167 384 int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h);
168
169 384 int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l);
170 384 int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h);
171 384 int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l);
172 384 int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h);
173 384 int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h);
174 384 int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h);
175
176 // Zip even and odd RGB pixels.
177 384 uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
178 384 uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
179 384 uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
180 384 uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
181 384 uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
182 384 uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
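
Because one chroma sample serves two adjacent columns (the same r_sub_y/g_sub_y/b_sub_y terms are added to both the even and odd luma lanes), the pipeline computed even and odd columns separately; vzip_u8 now restores source pixel order. A small sketch of that step, assuming eight even-column and eight odd-column results:

    // Illustrative sketch, not library code (requires <arm_neon.h>).
    // vzip_u8({e0..e7}, {o0..o7}) yields {e0,o0,e1,o1,e2,o2,e3,o3} and
    // {e4,o4,e5,o5,e6,o6,e7,o7}; vcombine_u8 rejoins them into one row
    // of 16 pixels in their original order.
    static inline uint8x16_t interleave_columns(uint8x8_t even, uint8x8_t odd) {
      uint8x8x2_t z = vzip_u8(even, odd);
      return vcombine_u8(z.val[0], z.val[1]);
    }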
183
184 if constexpr (kAlpha) {
185 192 uint8x16x4_t rgba0, rgba1;
186 // Red channel
187 192 rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]);
188 192 rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]);
189 // Green channel
190 192 rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]);
191 192 rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
192 // Blue channel
193 192 rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]);
194 192 rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]);
195 // Alpha channel
196 192 rgba0.val[3] = vdupq_n_u8(0xFF);
197 192 rgba1.val[3] = vdupq_n_u8(0xFF);
198
199 if constexpr (BGR) {
200 96 std::swap(rgba0.val[0], rgba0.val[2]);
201 96 std::swap(rgba1.val[0], rgba1.val[2]);
202 }
203
204 // Store RGBA pixels to memory.
205 192 vst4q_u8(rgbx_row_0, rgba0);
206 192 vst4q_u8(rgbx_row_1, rgba1);
207 192 } else {
208 192 uint8x16x3_t rgb0, rgb1;
209 // Red channel
210 192 rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]);
211 192 rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]);
212 // Green channel
213 192 rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]);
214 192 rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
215 // Blue channel
216 192 rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]);
217 192 rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]);
218
219 if constexpr (BGR) {
220 96 std::swap(rgb0.val[0], rgb0.val[2]);
221 96 std::swap(rgb1.val[0], rgb1.val[2]);
222 }
223
224 // Store RGB pixels to memory.
225 192 vst3q_u8(rgbx_row_0, rgb0);
226 192 vst3q_u8(rgbx_row_1, rgb1);
227 192 }
228 384 }
229
230 KLEIDICV_FORCE_INLINE
231 37248 void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128,
232 int32_t v_m128, uint8_t *rgbx_rows[2]) {
233 8/8 111744 for (size_t selector = 0; selector < 2; ++selector) {
    ✓ Branch 0 taken 9312 times.
    ✓ Branch 1 taken 18624 times.
    ✓ Branch 2 taken 9312 times.
    ✓ Branch 3 taken 18624 times.
    ✓ Branch 4 taken 9312 times.
    ✓ Branch 5 taken 18624 times.
    ✓ Branch 6 taken 9312 times.
    ✓ Branch 7 taken 18624 times.
234 74496 int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0);
235 74496 int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128;
236 148992 int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 +
237 74496 kUVWeights[kGVWeightIndex] * v_m128;
238 74496 int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128;
239
240 74496 r = rounding_shift_right(r, kWeightScale);
241 74496 g = rounding_shift_right(g, kWeightScale);
242 74496 b = rounding_shift_right(b, kWeightScale);
243
244 if constexpr (BGR) {
245 37248 std::swap(r, b);
246 }
247
248 74496 rgbx_rows[selector][0] = saturating_cast<int32_t, uint8_t>(r);
249 74496 rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g);
250 74496 rgbx_rows[selector][2] = saturating_cast<int32_t, uint8_t>(b);
251
252 if constexpr (kAlpha) {
253 37248 rgbx_rows[selector][3] = 0xFF;
254 }
255
256 74496 rgbx_rows[selector] += kAlpha ? 4 : 3;
257 74496 }
258 37248 }
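
This scalar tail path leans on two helpers defined elsewhere in KleidiCV. Plausible shapes for them, offered purely as assumptions since their real definitions are not in this file:

    // Assumed shapes only; the actual helpers live elsewhere in the
    // library and may differ.
    #include <algorithm>
    #include <cstdint>
    #include <limits>

    static inline int32_t rounding_shift_right(int32_t value, int shift) {
      // Shift right with round-to-nearest by adding half an LSB first.
      return (value + (1 << (shift - 1))) >> shift;
    }

    template <typename From, typename To>
    static inline To saturating_cast(From value) {
      // Clamp to the target type's range (e.g. [0, 255] for uint8_t).
      return static_cast<To>(std::clamp<From>(
          value, std::numeric_limits<To>::min(), std::numeric_limits<To>::max()));
    }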
259 };
260 } // namespace kleidicv::neon
261
262 #endif // KLEIDICV_YUV420_TO_RGB_NEON_H
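
For orientation, a hedged usage sketch of the scalar path on one column pair; the public KleidiCV entry points normally drive this class internally, so the direct call and the variable names here are illustrative only. From the parameter names, u_m128/v_m128 are assumed to be the chroma samples with 128 already subtracted:

    // Illustrative only: not the library's public API.
    // Each call converts one column on two adjacent rows; a full 2x2
    // block reuses the same chroma pair for index and index + 1.
    using Converter = kleidicv::neon::YUV420XToRGBxOrBGRx</*BGR=*/false, /*kAlpha=*/false>;
    Converter convert{/*v_first=*/false};  // true would swap U and V in the vector path

    const uint8_t *y_rows[2] = {y_row0, y_row1};  // two adjacent luma rows
    uint8_t *rgb_rows[2] = {rgb_row0, rgb_row1};  // advanced 3 bytes per call (4 with alpha)
    convert.yuv420x_to_rgb(y_rows, /*index=*/0, u - 128, v - 128, rgb_rows);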
263