KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv420_neon.h
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 180 180 100.0%
Functions: 104 104 100.0%
Branches: 80 80 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RGB_TO_YUV420_H
6 #define KLEIDICV_RGB_TO_YUV420_H
7
8 #include <algorithm>
9 #include <utility>
10
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/neon.h"
13 #include "yuv42x_coefficients.h"
14
15 namespace kleidicv::neon {
16
17 template <bool kAlpha, bool RGB, bool kInterleave>
18 class RGBxorBGRxToYUV420 {
19 public:
20 880 static kleidicv_error_t rgb2yuv420_operation(
21 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
22 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
23 bool v_first, size_t begin, size_t end) {
24 880 size_t row_begin = begin * 2;
25 880 size_t row_end = std::min<size_t>(height, end * 2);
26
27 880 const uint8_t *src_row = nullptr;
28 880 uint8_t *y_row = nullptr;
29 880 uint8_t *u_row = nullptr;
30 880 uint8_t *v_row = nullptr;
31
8/8
✓ Branch 0 taken 220 times.
✓ Branch 1 taken 9242 times.
✓ Branch 2 taken 220 times.
✓ Branch 3 taken 9242 times.
✓ Branch 4 taken 220 times.
✓ Branch 5 taken 9242 times.
✓ Branch 6 taken 220 times.
✓ Branch 7 taken 9242 times.
37848 for (size_t h = row_begin; h < row_end; h++) {
32 36968 src_row = src + src_stride * h;
33 36968 y_row = y_dst + y_stride * h;
34
35 36968 bool evenRow = (h & 1) == 0;
36
37
8/8
✓ Branch 0 taken 4582 times.
✓ Branch 1 taken 4660 times.
✓ Branch 2 taken 4582 times.
✓ Branch 3 taken 4660 times.
✓ Branch 4 taken 4582 times.
✓ Branch 5 taken 4660 times.
✓ Branch 6 taken 4582 times.
✓ Branch 7 taken 4660 times.
36968 if (evenRow) {
38 if constexpr (kInterleave) {
39 9376 u_row = uv_dst + uv_stride * (h / 2);
40 } else {
41 9264 u_row =
42 9264 uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2);
43 // Pointer to the start of the V plane.
44 // The V plane follows the U plane. Both U and V planes are
45 // subsampled at a 2:1 vertical ratio (i.e., each has height / 2
46 // rows) and stored in a single contiguous chroma region.
47 // Because each chroma stride row packs two half-width rows, the
48 // V row for image row h sits (height + 1) / 2 half-rows after its
49 // U row: dividing by 4 selects the stride row, and the parity
50 // term selects the first or second half-width slot within it.
51 18528 v_row = uv_dst + uv_stride * ((h + height + 1) / 4) +
52 9264 (((h + height + 1) / 2) % 2) * ((width + 1) / 2);
53 }
54 18640 }
55
56 36968 LoopUnroll2<TryToAvoidTailLoop> loop{width, kVectorLength};
57 47768 loop.unroll_twice([&](size_t index) {
58 10800 vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow);
59 10800 });
60
61 142360 loop.tail([&](size_t index) {
62 210784 scalar_path(src_row, y_row, u_row, v_row, v_first, index, width,
63 105392 evenRow);
64 105392 });
65 36968 }
66
67 880 return KLEIDICV_OK;
68 880 }
69
70 private:
71 KLEIDICV_FORCE_INLINE
72 10800 static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row,
73 uint8_t *u_row, uint8_t *v_row, const bool v_first,
74 const size_t index, const bool evenRow) {
75 10800 uint32x4_t r0[4], g0[4], b0[4], r1[4], g1[4], b1[4];
76
77 10800 load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, index);
78
79 10800 uint8x16_t y0 = rgb_to_y(r0, g0, b0);
80
81 10800 uint8x16_t y1 = rgb_to_y(r1, g1, b1);
82
83 10800 vst1q_u8(y_row + index, y0);
84 10800 vst1q_u8(y_row + index + kVectorLength, y1);
85
86 // U and V are subsampled by a factor of 2 in both horizontal and vertical
87 // directions for YUV420 format. Therefore, we only compute U and V from
88 // even rows and even columns. When the input RGB image has an odd width or
89 // height, the chroma (U and V) dimensions are rounded up. For example, if
90 // the height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2
91 // = 4.5 -> rounded up). The same rounding is applied for width.
92
8/8
✓ Branch 0 taken 1300 times.
✓ Branch 1 taken 1400 times.
✓ Branch 2 taken 1300 times.
✓ Branch 3 taken 1400 times.
✓ Branch 4 taken 1300 times.
✓ Branch 5 taken 1400 times.
✓ Branch 6 taken 1300 times.
✓ Branch 7 taken 1400 times.
10800 if (evenRow) {
93 5600 uint8x16x2_t uv;
94 5600 int32x4_t r_even[4] = {r0[0], r0[2], r1[0], r1[2]};
95 5600 int32x4_t g_even[4] = {g0[0], g0[2], g1[0], g1[2]};
96 5600 int32x4_t b_even[4] = {b0[0], b0[2], b1[0], b1[2]};
97 5600 rgb_to_uv_2x(r_even, g_even, b_even, uv.val[0], uv.val[1]);
98
99
8/8
✓ Branch 0 taken 700 times.
✓ Branch 1 taken 700 times.
✓ Branch 2 taken 700 times.
✓ Branch 3 taken 700 times.
✓ Branch 4 taken 700 times.
✓ Branch 5 taken 700 times.
✓ Branch 6 taken 700 times.
✓ Branch 7 taken 700 times.
5600 if (v_first) {
100 2800 std::swap(uv.val[0], uv.val[1]);
101 2800 }
102
103 if constexpr (kInterleave) {
104 2800 vst2q_u8(u_row + index, uv);
105 } else {
106 2800 vst1q_u8(u_row + index / 2, uv.val[0]);
107 2800 vst1q_u8(v_row + index / 2, uv.val[1]);
108 }
109 5600 }
110 10800 }
111
112 105392 static void scalar_path(const uint8_t *src_row, uint8_t *y_row,
113 uint8_t *u_row, uint8_t *v_row, const bool v_first,
114 size_t index, const size_t length,
115 const bool evenRow) {
116 105392 const size_t u_index_ = v_first;
117 105392 const size_t v_index_ = !v_first;
118
119
8/8
✓ Branch 0 taken 100198 times.
✓ Branch 1 taken 26348 times.
✓ Branch 2 taken 100198 times.
✓ Branch 3 taken 26348 times.
✓ Branch 4 taken 100198 times.
✓ Branch 5 taken 26348 times.
✓ Branch 6 taken 100198 times.
✓ Branch 7 taken 26348 times.
506184 for (; index < length; index += 1) {
120 400792 uint8_t b0{}, g0{}, r0{};
121 400792 bool evenCol = (index & 1) == 0;
122 400792 b0 = src_row[index * scn + b_index_];
123 400792 g0 = src_row[index * scn + g_index_];
124 400792 r0 = src_row[index * scn + r_index_];
125
126 400792 uint8_t y0 = rgb_to_y(r0, g0, b0);
127 400792 y_row[index] = y0;
128
129 // U and V are subsampled by a factor of 2 in both horizontal and vertical
130 // directions
131 // for YUV420 format. Therefore, we only compute U and V from even rows
132 // and even columns. When the input RGB image has an odd width or height,
133 // the chroma (U and V) dimensions are rounded up. For example, if the
134 // height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2 = 4.5
135 // -> rounded up). The same rounding is applied for width.
136
16/16
✓ Branch 0 taken 51060 times.
✓ Branch 1 taken 49138 times.
✓ Branch 2 taken 28400 times.
✓ Branch 3 taken 22660 times.
✓ Branch 4 taken 51060 times.
✓ Branch 5 taken 49138 times.
✓ Branch 6 taken 28400 times.
✓ Branch 7 taken 22660 times.
✓ Branch 8 taken 51060 times.
✓ Branch 9 taken 49138 times.
✓ Branch 10 taken 28400 times.
✓ Branch 11 taken 22660 times.
✓ Branch 12 taken 51060 times.
✓ Branch 13 taken 49138 times.
✓ Branch 14 taken 28400 times.
✓ Branch 15 taken 22660 times.
400792 if (evenRow && evenCol) {
137 90640 uint8_t uv[2] = {0, 0};
138 90640 rgb_to_uv(r0, g0, b0, uv);
139 if constexpr (kInterleave) {
140 51984 u_row[index] = uv[u_index_];
141 51984 u_row[index + 1] = uv[v_index_];
142 } else {
143 38656 u_row[(index + 1) / 2] = uv[u_index_];
144 38656 v_row[(index + 1) / 2] = uv[v_index_];
145 }
146 90640 }
147 400792 }
148 105392 }
149
150 400792 static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
151 400792 const int kShifted16 = (16 << kWeightScale);
152 400792 const int kHalfShift = (1 << (kWeightScale - 1));
153 801584 int yy =
154 400792 kRYWeight * r + kGYWeight * g + kBYWeight * b + kHalfShift + kShifted16;
155
156 801584 return std::clamp(yy >> kWeightScale, 0, 0xff);
157 400792 }
158
159 21600 static uint8x16_t rgb_to_y(const uint32x4_t r[4], const uint32x4_t g[4],
160 const uint32x4_t b[4]) {
161 21600 const int kShifted16 = (16 << kWeightScale);
162 21600 const int kHalfShift = (1 << (kWeightScale - 1));
163
164 // Y = kR*R + kG*G + kB*B + rounding bias
165 21600 uint32x4_t v_kRYWeight = vdupq_n_u32(kRYWeight);
166 21600 uint32x4_t v_kGYWeight = vdupq_n_u32(kGYWeight);
167 21600 uint32x4_t v_kBYWeight = vdupq_n_u32(kBYWeight);
168 21600 uint32x4_t y[4];
169
170 KLEIDICV_FORCE_LOOP_UNROLL
171
8/8
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 21600 times.
✓ Branch 2 taken 5400 times.
✓ Branch 3 taken 21600 times.
✓ Branch 4 taken 5400 times.
✓ Branch 5 taken 21600 times.
✓ Branch 6 taken 5400 times.
✓ Branch 7 taken 21600 times.
108000 for (int i = 0; i < 4; i++) {
172 86400 y[i] = vdupq_n_u32(kHalfShift + kShifted16);
173 86400 y[i] = vmlaq_u32(y[i], r[i], v_kRYWeight);
174 86400 y[i] = vmlaq_u32(y[i], g[i], v_kGYWeight);
175 86400 y[i] = vmlaq_u32(y[i], b[i], v_kBYWeight);
176 86400 }
177
178 43200 return normalize_and_pack_y(y);
179 21600 }
180
181 90640 static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t uv[2]) {
182 90640 const int kHalfShift = (1 << (kWeightScale - 1));
183 90640 const int kShifted128 = (128 << kWeightScale);
184 90640 int uu = kRUWeight * r + kGUWeight * g + kBUWeight * b + kHalfShift +
185 kShifted128;
186 90640 int vv = kBUWeight * r + kGVWeight * g + kBVWeight * b + kHalfShift +
187 kShifted128;
188
189 90640 uv[0] = std::clamp(uu >> kWeightScale, 0, 0xff);
190 90640 uv[1] = std::clamp(vv >> kWeightScale, 0, 0xff);
191 90640 }
192
193 11200 static uint8x16_t compute_u_or_v_2x(const int32x4_t r[4],
194 const int32x4_t g[4],
195 const int32x4_t b[4], const int r_coeff,
196 const int g_coeff, const int b_coeff) {
197 // Constants for U/V calculation
198 11200 const int kHalfShift = (1 << (kWeightScale - 1));
199 11200 const int kShifted128 = (128 << kWeightScale);
200
201 11200 int32x4_t v_r_coeff = vdupq_n_s32(r_coeff);
202 11200 int32x4_t v_g_coeff = vdupq_n_s32(g_coeff);
203 11200 int32x4_t v_b_coeff = vdupq_n_s32(b_coeff);
204 11200 int32x4_t uv[4];
205
206 KLEIDICV_FORCE_LOOP_UNROLL
207
8/8
✓ Branch 0 taken 2800 times.
✓ Branch 1 taken 11200 times.
✓ Branch 2 taken 2800 times.
✓ Branch 3 taken 11200 times.
✓ Branch 4 taken 2800 times.
✓ Branch 5 taken 11200 times.
✓ Branch 6 taken 2800 times.
✓ Branch 7 taken 11200 times.
56000 for (int i = 0; i < 4; i++) {
208 44800 uv[i] = vdupq_n_s32(kHalfShift + kShifted128);
209 44800 uv[i] = vmlaq_s32(uv[i], r[i], v_r_coeff);
210 44800 uv[i] = vmlaq_s32(uv[i], g[i], v_g_coeff);
211 44800 uv[i] = vmlaq_s32(uv[i], b[i], v_b_coeff);
212 44800 }
213
214 22400 return normalize_and_pack_u_or_v(uv);
215 11200 }
216
217 5600 static void rgb_to_uv_2x(const int32x4_t r[4], const int32x4_t g[4],
218 const int32x4_t b[4], uint8x16_t &u, uint8x16_t &v) {
219 // ---------------- U (Cb) Component ----------------
220 // U = R * kRU + G * kGU + B * kBU + bias
221 5600 u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight);
222
223 // ---------------- V (Cr) Component ----------------
224 // V = R * kBU + G * kGV + B * kBV + bias (kBU is deliberately
225 // reused as V's R coefficient — in BT.601 kRV == kBU == 0.5)
225 5600 v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight);
226 5600 }
227
228 21600 static uint8x16_t normalize_and_pack_y(uint32x4_t vec[4]) {
229 // The y_index table selects the correct output order after normalization.
230 // When we load and separate the RGB values for UV calculation, we
231 // deinterleave them into even and odd components. As a result, the
232 // processed values are stored in two separate vectors. During
233 // normalization, we need to interleave them again to produce the final
234 // contiguous output, and this index pattern achieves that.
235 21600 uint8x16_t y_index = {1, 17, 3, 19, 5, 21, 7, 23,
236 9, 25, 11, 27, 13, 29, 15, 31};
237
238 // Normalize down by right-shifting the fixed-point result
239 // vshrn_n can only shift by an immediate value between 1 and 16.
240 // Since kWeightScale is 20, we use (kWeightScale - 8) to shift down to 12
241 // bits. This ensures that the most relevant 8-bit result lies in the second
242 // byte of each 16-bit element. As a result, the lookup tables are
243 // constructed with only odd indices to extract the second byte from each
244 // element.
245 21600 uint16x4_t tmp_lo_lo = vshrn_n_u32(vec[0], kWeightScale - 8);
246 43200 uint16x8_t tmp_lo_hi =
247 21600 vshrn_high_n_u32(tmp_lo_lo, vec[2], kWeightScale - 8);
248 21600 uint16x4_t tmp_hi_lo = vshrn_n_u32(vec[1], kWeightScale - 8);
249 43200 uint16x8_t tmp_hi_hi =
250 21600 vshrn_high_n_u32(tmp_hi_lo, vec[3], kWeightScale - 8);
251
252 21600 uint8x16x2_t tmp;
253 21600 tmp.val[0] = vreinterpretq_u8(tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14
254 21600 tmp.val[1] = vreinterpretq_u8(tmp_hi_hi); // 1, 3, 5, 7, 9, 11, 13, 15
255
256 21600 uint8x16_t output = vqtbl2q_u8(tmp, y_index);
257
258 43200 return output;
259 21600 }
260
261 11200 static uint8x16_t normalize_and_pack_u_or_v(int32x4_t vec[4]) {
262 // The uv_index table is used to finalize the order of U and V values.
263 // Unlike the Y component, we don't need to interleave even and odd elements
264 // manually. This is because the first vector already contains even-indexed
265 // values from the lower RGB block, and the second vector contains
266 // even-indexed values from the higher RGB block. As a result, the values
267 // are already sorted in the correct order for output.
268 11200 uint8x16_t uv_index = {1, 3, 5, 7, 9, 11, 13, 15,
269 17, 19, 21, 23, 25, 27, 29, 31};
270
271 // Normalize down by right-shifting the fixed-point result
272 // vshrn_n can only shift by an immediate value between 1 and 16.
273 // Since kWeightScale is 20, we use (kWeightScale - 8) to shift down to 12
274 // bits. This ensures that the most relevant 8-bit result lies in the second
275 // byte of each 16-bit element. As a result, the lookup tables are
276 // constructed with only odd indices to extract the second byte from each
277 // element.
278 11200 int16x4_t tmp_lo_lo = vshrn_n_s32(vec[0], kWeightScale - 8);
279 11200 int16x8_t tmp_lo_hi = vshrn_high_n_s32(tmp_lo_lo, vec[1], kWeightScale - 8);
280 11200 int16x4_t tmp_hi_lo = vshrn_n_s32(vec[2], kWeightScale - 8);
281 11200 int16x8_t tmp_hi_hi = vshrn_high_n_s32(tmp_hi_lo, vec[3], kWeightScale - 8);
282
283 11200 uint8x16x2_t tmp;
284 11200 tmp.val[0] = vreinterpretq_u8(
285 11200 tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the first vector
286 11200 tmp.val[1] = vreinterpretq_u8(
287 11200 tmp_hi_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the second vector
288 11200 uint8x16_t output = vqtbl2q_u8(tmp, uv_index);
289
290 22400 return output;
291 11200 }
292
293 10800 static void load_rgb_2x(uint32x4_t r0[4], uint32x4_t g0[4], uint32x4_t b0[4],
294 uint32x4_t r1[4], uint32x4_t g1[4], uint32x4_t b1[4],
295 const uint8_t *src_row, const size_t index) {
296 10800 uint8x16_t tmp_b0, tmp_b1, tmp_g0, tmp_g1, tmp_r0, tmp_r1;
297 // Load 32 pixels: two vectors of interleaved channels
298
299 if constexpr (kAlpha) {
300 // 4-channel input (RGBA or BGRA)
301 5400 uint8x16x4_t vsrc0 = vld4q_u8(src_row + scn * index);
302 5400 uint8x16x4_t vsrc1 =
303 5400 vld4q_u8(src_row + scn * index + scn * kVectorLength);
304
305 5400 tmp_b0 = vsrc0.val[b_index_];
306 5400 tmp_g0 = vsrc0.val[g_index_];
307 5400 tmp_r0 = vsrc0.val[r_index_];
308
309 5400 tmp_b1 = vsrc1.val[b_index_];
310 5400 tmp_g1 = vsrc1.val[g_index_];
311 5400 tmp_r1 = vsrc1.val[r_index_];
312 5400 } else {
313 // 3-channel input (RGB or BGR)
314 5400 uint8x16x3_t vsrc0 = vld3q_u8(src_row + scn * index);
315 5400 uint8x16x3_t vsrc1 =
316 5400 vld3q_u8(src_row + scn * index + scn * kVectorLength);
317
318 5400 tmp_b0 = vsrc0.val[b_index_];
319 5400 tmp_g0 = vsrc0.val[g_index_];
320 5400 tmp_r0 = vsrc0.val[r_index_];
321
322 5400 tmp_b1 = vsrc1.val[b_index_];
323 5400 tmp_g1 = vsrc1.val[g_index_];
324 5400 tmp_r1 = vsrc1.val[r_index_];
325 5400 }
326 // After loading the vector, we extend the channels and separate even and
327 // odd elements. This separation is important for UV calculation, as only
328 // the even-indexed values are used.
329 10800 uint8x16_t indices[4] = {
330 0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 4, 0xff, 0xff,
331 0xff, 6, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 3, 0xff,
332 0xff, 0xff, 5, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 8,
333 0xff, 0xff, 0xff, 10, 0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff,
334 14, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, 11, 0xff, 0xff,
335 0xff, 13, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
336
337 // Expand each 8-bit channel into 32-bit vectors using table lookup and
338 // reinterpret
339 KLEIDICV_FORCE_LOOP_UNROLL
340
8/8
✓ Branch 0 taken 2700 times.
✓ Branch 1 taken 10800 times.
✓ Branch 2 taken 2700 times.
✓ Branch 3 taken 10800 times.
✓ Branch 4 taken 2700 times.
✓ Branch 5 taken 10800 times.
✓ Branch 6 taken 2700 times.
✓ Branch 7 taken 10800 times.
54000 for (int i = 0; i < 4; i++) {
341 43200 r0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r0, indices[i]));
342 43200 g0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g0, indices[i]));
343 43200 b0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b0, indices[i]));
344 43200 r1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r1, indices[i]));
345 43200 g1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g1, indices[i]));
346 43200 b1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b1, indices[i]));
347 43200 }
348 10800 }
349
350 static constexpr size_t r_index_ = RGB ? 0 : 2;
351 static constexpr size_t g_index_ = 1;
352 static constexpr size_t b_index_ = RGB ? 2 : 0;
353 static constexpr size_t scn = kAlpha ? 4 : 3;
354 };
355
356 } // namespace kleidicv::neon
357
358 #endif // KLEIDICV_RGB_TO_YUV420_H
359