KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv420_neon.h
Date: 2025-09-25 14:13:34
            Exec   Total   Coverage
Lines:       180     180     100.0%
Functions:   104     104     100.0%
Branches:     80      80     100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RGB_TO_YUV420_H
6 #define KLEIDICV_RGB_TO_YUV420_H
7
8 #include <algorithm>
9 #include <utility>
10
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/neon.h"
13 #include "yuv420_coefficients.h"
14
15 namespace kleidicv::neon {
16
17 template <bool kAlpha, bool RGB, bool kInterleave>
18 class RGBxorBGRxToYUV420 {
19 public:
20 376 static kleidicv_error_t rgb2yuv420_operation(
21 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
22 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
23 bool v_first, size_t begin, size_t end) {
24 376 size_t row_begin = begin * 2;
25 376 size_t row_end = std::min<size_t>(height, end * 2);
26
27 376 const uint8_t *src_row = nullptr;
28 376 uint8_t *y_row = nullptr;
29 376 uint8_t *u_row = nullptr;
30 376 uint8_t *v_row = nullptr;
31
8/8
✓ Branch 0 taken 94 times.
✓ Branch 1 taken 4810 times.
✓ Branch 2 taken 94 times.
✓ Branch 3 taken 4810 times.
✓ Branch 4 taken 94 times.
✓ Branch 5 taken 4810 times.
✓ Branch 6 taken 94 times.
✓ Branch 7 taken 4810 times.
19616 for (size_t h = row_begin; h < row_end; h++) {
32 19240 src_row = src + src_stride * h;
33 19240 y_row = y_dst + y_stride * h;
34
35 19240 bool evenRow = (h & 1) == 0;
36
37
8/8
✓ Branch 0 taken 2382 times.
✓ Branch 1 taken 2428 times.
✓ Branch 2 taken 2382 times.
✓ Branch 3 taken 2428 times.
✓ Branch 4 taken 2382 times.
✓ Branch 5 taken 2428 times.
✓ Branch 6 taken 2382 times.
✓ Branch 7 taken 2428 times.
19240 if (evenRow) {
38 if constexpr (kInterleave) {
39 4912 u_row = uv_dst + uv_stride * (h / 2);
40 } else {
41 4800 u_row =
42 4800 uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2);
43 // Pointer to the start of the V plane.
44 // The V plane follows the U plane. Both planes are subsampled at a
45 // 2:1 vertical ratio (each has (height + 1) / 2 rows) and stored in
46 // a single contiguous chroma region, with two half-width chroma rows
47 // packed into each stride-sized row. When the number of U rows is
48 // odd, the V plane starts halfway through a stride row, so a
49 // half-row offset is folded into the calculation of the V plane
50 // position below.
51 9600 v_row = uv_dst + uv_stride * ((h + height + 1) / 4) +
52 4800 (((h + height + 1) / 2) % 2) * ((width + 1) / 2);
53 }
54 9712 }
55
56 19240 LoopUnroll2<TryToAvoidTailLoop> loop{width, kVectorLength};
57 30040 loop.unroll_twice([&](size_t index) {
58 10800 vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow);
59 10800 });
60
61 77552 loop.tail([&](size_t index) {
62 116624 scalar_path(src_row, y_row, u_row, v_row, v_first, index, width,
63 58312 evenRow);
64 58312 });
65 19240 }
66
67 376 return KLEIDICV_OK;
68 376 }
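
For reference, the non-interleaved (planar) chroma addressing above can be reduced to a small standalone sketch. The helper below is illustrative only and not part of KleidiCV; it assumes the same packing as the code above, with two half-width chroma rows per uv_stride row and ceil rounding for odd dimensions.

// Illustrative only: mirrors the u_row / v_row arithmetic of the
// non-interleaved branch above; not part of KleidiCV's API.
#include <cstddef>
#include <cstdio>

struct ChromaOffsets {
  size_t u;  // byte offset of the U samples for image row h
  size_t v;  // byte offset of the V samples for image row h
};

static ChromaOffsets chroma_offsets(size_t h, size_t width, size_t height,
                                    size_t uv_stride) {
  size_t half_width = (width + 1) / 2;  // chroma width, rounded up
  // U: chroma row h / 2, packed two half-width rows per uv_stride row.
  size_t u = uv_stride * (h / 4) + ((h / 2) % 2) * half_width;
  // V: same packing, shifted past the (height + 1) / 2 rows of U.
  size_t v = uv_stride * ((h + height + 1) / 4) +
             (((h + height + 1) / 2) % 2) * half_width;
  return {u, v};
}

int main() {
  // 6x4 image with uv_stride == 6: U fills stride row 0 (offsets 0 and 3),
  // so V starts on stride row 1 (offsets 6 and 9).
  for (size_t h = 0; h < 4; h += 2) {
    ChromaOffsets o = chroma_offsets(h, 6, 4, 6);
    std::printf("h=%zu u_offset=%zu v_offset=%zu\n", h, o.u, o.v);
  }
  return 0;
}
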
69
70 private:
71 10800 static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row,
72 uint8_t *u_row, uint8_t *v_row, const bool v_first,
73 const size_t index, const bool evenRow) {
74 10800 uint32x4_t r0[4], g0[4], b0[4], r1[4], g1[4], b1[4];
75
76 10800 load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, index);
77
78 10800 uint8x16_t y0 = rgb_to_y(r0, g0, b0);
79
80 10800 uint8x16_t y1 = rgb_to_y(r1, g1, b1);
81
82 10800 vst1q_u8(y_row + index, y0);
83 10800 vst1q_u8(y_row + index + kVectorLength, y1);
84
85 // U and V are subsampled by a factor of 2 in both horizontal and vertical
86 // directions for YUV420 format. Therefore, we only compute U and V from
87 // even rows and even columns. When the input RGB image has an odd width or
88 // height, the chroma (U and V) dimensions are rounded up. For example, if
89 // the height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2
90 // = 4.5 -> rounded up). The same rounding is applied for width.
91
8/8
✓ Branch 0 taken 1300 times.
✓ Branch 1 taken 1400 times.
✓ Branch 2 taken 1300 times.
✓ Branch 3 taken 1400 times.
✓ Branch 4 taken 1300 times.
✓ Branch 5 taken 1400 times.
✓ Branch 6 taken 1300 times.
✓ Branch 7 taken 1400 times.
10800 if (evenRow) {
92 5600 uint8x16x2_t uv;
93 5600 int32x4_t r_even[4] = {r0[0], r0[2], r1[0], r1[2]};
94 5600 int32x4_t g_even[4] = {g0[0], g0[2], g1[0], g1[2]};
95 5600 int32x4_t b_even[4] = {b0[0], b0[2], b1[0], b1[2]};
96 5600 rgb_to_uv_2x(r_even, g_even, b_even, uv.val[0], uv.val[1]);
97
98
8/8
✓ Branch 0 taken 700 times.
✓ Branch 1 taken 700 times.
✓ Branch 2 taken 700 times.
✓ Branch 3 taken 700 times.
✓ Branch 4 taken 700 times.
✓ Branch 5 taken 700 times.
✓ Branch 6 taken 700 times.
✓ Branch 7 taken 700 times.
5600 if (v_first) {
99 2800 std::swap(uv.val[0], uv.val[1]);
100 2800 }
101
102 if constexpr (kInterleave) {
103 2800 vst2q_u8(u_row + index, uv);
104 } else {
105 2800 vst1q_u8(u_row + index / 2, uv.val[0]);
106 2800 vst1q_u8(v_row + index / 2, uv.val[1]);
107 }
108 5600 }
109 10800 }
110
111 58312 static void scalar_path(const uint8_t *src_row, uint8_t *y_row,
112 uint8_t *u_row, uint8_t *v_row, const bool v_first,
113 size_t index, const size_t length,
114 const bool evenRow) {
115 58312 const size_t u_index_ = v_first;
116 58312 const size_t v_index_ = !v_first;
117
118
8/8
✓ Branch 0 taken 65246 times.
✓ Branch 1 taken 14578 times.
✓ Branch 2 taken 65246 times.
✓ Branch 3 taken 14578 times.
✓ Branch 4 taken 65246 times.
✓ Branch 5 taken 14578 times.
✓ Branch 6 taken 65246 times.
✓ Branch 7 taken 14578 times.
319296 for (; index < length; index += 1) {
119 260984 uint8_t b0{}, g0{}, r0{};
120 260984 bool evenCol = (index & 1) == 0;
121 260984 b0 = src_row[index * scn + b_index_];
122 260984 g0 = src_row[index * scn + g_index_];
123 260984 r0 = src_row[index * scn + r_index_];
124
125 260984 uint8_t y0 = rgb_to_y(r0, g0, b0);
126 260984 y_row[index] = y0;
127
128 // U and V are subsampled by a factor of 2 in both horizontal and vertical
129 // directions
130 // for YUV420 format. Therefore, we only compute U and V from even rows
131 // and even columns. When the input RGB image has an odd width or height,
132 // the chroma (U and V) dimensions are rounded up. For example, if the
133 // height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2 = 4.5
134 // -> rounded up). The same rounding is applied for width.
135
16/16
✓ Branch 0 taken 33384 times.
✓ Branch 1 taken 31862 times.
✓ Branch 2 taken 18092 times.
✓ Branch 3 taken 15292 times.
✓ Branch 4 taken 33384 times.
✓ Branch 5 taken 31862 times.
✓ Branch 6 taken 18092 times.
✓ Branch 7 taken 15292 times.
✓ Branch 8 taken 33384 times.
✓ Branch 9 taken 31862 times.
✓ Branch 10 taken 18092 times.
✓ Branch 11 taken 15292 times.
✓ Branch 12 taken 33384 times.
✓ Branch 13 taken 31862 times.
✓ Branch 14 taken 18092 times.
✓ Branch 15 taken 15292 times.
260984 if (evenRow && evenCol) {
136 61168 uint8_t uv[2] = {0, 0};
137 61168 rgb_to_uv(r0, g0, b0, uv);
138 if constexpr (kInterleave) {
139 37248 u_row[index] = uv[u_index_];
140 37248 u_row[index + 1] = uv[v_index_];
141 } else {
142 23920 u_row[(index + 1) / 2] = uv[u_index_];
143 23920 v_row[(index + 1) / 2] = uv[v_index_];
144 }
145 61168 }
146 260984 }
147 58312 }
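
The chroma-column rounding described in the comment above can be checked with a few lines of arithmetic: with an odd width, the last (even) column still maps inside the rounded-up chroma row. A minimal check, using width 9 as in the comment:

#include <cstdio>

int main() {
  // Even columns of a width-9 row map to chroma columns 0..4,
  // i.e. (9 + 1) / 2 = 5 chroma samples per row.
  const int width = 9;
  for (int index = 0; index < width; index += 2) {
    std::printf("column %d -> chroma column %d\n", index, (index + 1) / 2);
  }
  return 0;
}
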
148
149 260984 static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
150 260984 const int kShifted16 = (16 << kWeightScale);
151 260984 const int kHalfShift = (1 << (kWeightScale - 1));
152 521968 int yy =
153 260984 kRYWeight * r + kGYWeight * g + kBYWeight * b + kHalfShift + kShifted16;
154
155 521968 return std::clamp(yy >> kWeightScale, 0, 0xff);
156 260984 }
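
As a standalone illustration of the fixed-point arithmetic above, the sketch below uses BT.601 limited-range weights scaled by 2^20. The actual kRYWeight/kGYWeight/kBYWeight values are defined in yuv420_coefficients.h and may differ; the numbers here are assumptions for demonstration only.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Assumed values, approximating 0.257 / 0.504 / 0.098 in Q20 fixed point;
// the real weights live in yuv420_coefficients.h.
static constexpr int kScale = 20;
static constexpr int kRY = 269484;  // ~0.257 * 2^20
static constexpr int kGY = 528482;  // ~0.504 * 2^20
static constexpr int kBY = 102760;  // ~0.098 * 2^20

static uint8_t rgb_to_y_sketch(uint8_t r, uint8_t g, uint8_t b) {
  int y = kRY * r + kGY * g + kBY * b  // weighted sum in Q20
          + (1 << (kScale - 1))        // rounding bias
          + (16 << kScale);            // limited-range offset
  return static_cast<uint8_t>(std::clamp(y >> kScale, 0, 0xff));
}

int main() {
  // Expected with these weights: black -> 16, white -> 235, red -> 82.
  std::printf("black=%d white=%d red=%d\n", rgb_to_y_sketch(0, 0, 0),
              rgb_to_y_sketch(255, 255, 255), rgb_to_y_sketch(255, 0, 0));
  return 0;
}
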
157
158 21600 static uint8x16_t rgb_to_y(const uint32x4_t r[4], const uint32x4_t g[4],
159 const uint32x4_t b[4]) {
160 21600 const int kShifted16 = (16 << kWeightScale);
161 21600 const int kHalfShift = (1 << (kWeightScale - 1));
162
163 // Y = kR*R + kG*G + kB*B + rounding bias
164 21600 uint32x4_t v_kRYWeight = vdupq_n_u32(kRYWeight);
165 21600 uint32x4_t v_kGYWeight = vdupq_n_u32(kGYWeight);
166 21600 uint32x4_t v_kBYWeight = vdupq_n_u32(kBYWeight);
167 21600 uint32x4_t y[4];
168
169 KLEIDICV_FORCE_LOOP_UNROLL
170
8/8
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 21600 times.
✓ Branch 2 taken 5400 times.
✓ Branch 3 taken 21600 times.
✓ Branch 4 taken 5400 times.
✓ Branch 5 taken 21600 times.
✓ Branch 6 taken 5400 times.
✓ Branch 7 taken 21600 times.
108000 for (int i = 0; i < 4; i++) {
171 86400 y[i] = vdupq_n_u32(kHalfShift + kShifted16);
172 86400 y[i] = vmlaq_u32(y[i], r[i], v_kRYWeight);
173 86400 y[i] = vmlaq_u32(y[i], g[i], v_kGYWeight);
174 86400 y[i] = vmlaq_u32(y[i], b[i], v_kBYWeight);
175 86400 }
176
177 43200 return normalize_and_pack_y(y);
178 21600 }
179
180 61168 static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t uv[2]) {
181 61168 const int kHalfShift = (1 << (kWeightScale - 1));
182 61168 const int kShifted128 = (128 << kWeightScale);
183 61168 int uu = kRUWeight * r + kGUWeight * g + kBUWeight * b + kHalfShift +
184 kShifted128;
185 61168 int vv = kBUWeight * r + kGVWeight * g + kBVWeight * b + kHalfShift +
186 kShifted128;
187
188 61168 uv[0] = std::clamp(uu >> kWeightScale, 0, 0xff);
189 61168 uv[1] = std::clamp(vv >> kWeightScale, 0, 0xff);
190 61168 }
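
The same fixed-point pattern applies to U and V. In BT.601 the red weight of V equals the blue weight of U (about 0.439), which is presumably why the code above reuses kBUWeight in the V expression. The sketch below again uses assumed Q20 weights; the real ones are in yuv420_coefficients.h.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Assumed Q20 weights (BT.601 limited range), for illustration only.
static constexpr int kScale = 20;
static constexpr int kRU = -155188;  // ~ -0.148 * 2^20
static constexpr int kGU = -305135;  // ~ -0.291 * 2^20
static constexpr int kBU = 460324;   // ~  0.439 * 2^20 (also the R weight of V)
static constexpr int kGV = -385875;  // ~ -0.368 * 2^20
static constexpr int kBV = -74448;   // ~ -0.071 * 2^20

static void rgb_to_uv_sketch(uint8_t r, uint8_t g, uint8_t b, uint8_t uv[2]) {
  const int bias = (1 << (kScale - 1)) + (128 << kScale);
  int u = kRU * r + kGU * g + kBU * b + bias;
  int v = kBU * r + kGV * g + kBV * b + bias;  // R weight of V == kBU
  uv[0] = static_cast<uint8_t>(std::clamp(u >> kScale, 0, 0xff));
  uv[1] = static_cast<uint8_t>(std::clamp(v >> kScale, 0, 0xff));
}

int main() {
  uint8_t uv[2];
  rgb_to_uv_sketch(128, 128, 128, uv);  // grey -> U = V = 128
  std::printf("grey: U=%d V=%d\n", uv[0], uv[1]);
  rgb_to_uv_sketch(255, 0, 0, uv);      // pure red -> low U, high V
  std::printf("red:  U=%d V=%d\n", uv[0], uv[1]);
  return 0;
}
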
191
192 11200 static uint8x16_t compute_u_or_v_2x(const int32x4_t r[4],
193 const int32x4_t g[4],
194 const int32x4_t b[4], const int r_coeff,
195 const int g_coeff, const int b_coeff) {
196 // Constants for U/V calculation
197 11200 const int kHalfShift = (1 << (kWeightScale - 1));
198 11200 const int kShifted128 = (128 << kWeightScale);
199
200 11200 int32x4_t v_r_coeff = vdupq_n_s32(r_coeff);
201 11200 int32x4_t v_g_coeff = vdupq_n_s32(g_coeff);
202 11200 int32x4_t v_b_coeff = vdupq_n_s32(b_coeff);
203 11200 int32x4_t uv[4];
204
205 KLEIDICV_FORCE_LOOP_UNROLL
206
8/8
✓ Branch 0 taken 2800 times.
✓ Branch 1 taken 11200 times.
✓ Branch 2 taken 2800 times.
✓ Branch 3 taken 11200 times.
✓ Branch 4 taken 2800 times.
✓ Branch 5 taken 11200 times.
✓ Branch 6 taken 2800 times.
✓ Branch 7 taken 11200 times.
56000 for (int i = 0; i < 4; i++) {
207 44800 uv[i] = vdupq_n_s32(kHalfShift + kShifted128);
208 44800 uv[i] = vmlaq_s32(uv[i], r[i], v_r_coeff);
209 44800 uv[i] = vmlaq_s32(uv[i], g[i], v_g_coeff);
210 44800 uv[i] = vmlaq_s32(uv[i], b[i], v_b_coeff);
211 44800 }
212
213 22400 return normalize_and_pack_u_or_v(uv);
214 11200 }
215
216 5600 static void rgb_to_uv_2x(const int32x4_t r[4], const int32x4_t g[4],
217 const int32x4_t b[4], uint8x16_t &u, uint8x16_t &v) {
218 // ---------------- U (Cb) Component ----------------
219 // U = R * kRU + G * kGU + B * kBU + bias
220 5600 u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight);
221
222 // ---------------- V (Cr) Component ----------------
223 // V = R * kBU + G * kGV + B * kBV + bias
224 5600 v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight);
225 5600 }
226
227 21600 static uint8x16_t normalize_and_pack_y(uint32x4_t vec[4]) {
228 // The y_index table selects the correct output order after normalization.
229 // When we load and separate the RGB values for UV calculation, we
230 // deinterleave them into even and odd components. As a result, the
231 // processed values are stored in two separate vectors. During
232 // normalization, we need to interleave them again to produce the final
233 // contiguous output, and this index pattern achieves that.
234 21600 uint8x16_t y_index = {1, 17, 3, 19, 5, 21, 7, 23,
235 9, 25, 11, 27, 13, 29, 15, 31};
236
237 // Normalize down by right-shifting the fixed-point result
238 // vshrn_n can only shift by an immediate value between 1 and 16.
239 // Since kWeightScale is 20, we shift right by (kWeightScale - 8) = 12
240 // bits instead. This leaves the relevant 8-bit result in the second
241 // byte of each 16-bit element, which is why the lookup table is
242 // constructed with only odd indices: they extract that second byte
243 // from each element.
244 21600 uint16x4_t tmp_lo_lo = vshrn_n_u32(vec[0], kWeightScale - 8);
245 43200 uint16x8_t tmp_lo_hi =
246 21600 vshrn_high_n_u32(tmp_lo_lo, vec[2], kWeightScale - 8);
247 21600 uint16x4_t tmp_hi_lo = vshrn_n_u32(vec[1], kWeightScale - 8);
248 43200 uint16x8_t tmp_hi_hi =
249 21600 vshrn_high_n_u32(tmp_hi_lo, vec[3], kWeightScale - 8);
250
251 21600 uint8x16x2_t tmp;
252 21600 tmp.val[0] = vreinterpretq_u8(tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14
253 21600 tmp.val[1] = vreinterpretq_u8(tmp_hi_hi); // 1, 3, 5, 7, 9, 11, 13, 15
254
255 21600 uint8x16_t output = vqtbl2q_u8(tmp, y_index);
256
257 43200 return output;
258 21600 }
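
The two-step normalization (a narrowing shift by 12, then picking byte 1 of each 16-bit lane via the table) is equivalent to a single shift by kWeightScale for results that fit in 8 bits, as Y does here. A scalar model of one lane, assuming kWeightScale is 20 as stated above and the little-endian lane layout of AArch64, where byte 1 is the high byte of a 16-bit element:

#include <cassert>
#include <cstdint>

int main() {
  const int kWeightScale = 20;
  // A representative Q20 accumulator: Y value 107 plus fractional bits.
  uint32_t acc = (107u << kWeightScale) + 12345u;

  // One shift by the full scale.
  uint8_t direct = static_cast<uint8_t>(acc >> kWeightScale);

  // Step 1: narrowing shift by (kWeightScale - 8) = 12, keep 16 bits.
  uint16_t narrowed = static_cast<uint16_t>(acc >> (kWeightScale - 8));
  // Step 2: the table lookup selects the high byte of the 16-bit lane.
  uint8_t via_table = static_cast<uint8_t>(narrowed >> 8);

  assert(direct == via_table);
  return 0;
}
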
259
260 11200 static uint8x16_t normalize_and_pack_u_or_v(int32x4_t vec[4]) {
261 // The uv_index table is used to finalize the order of U and V values.
262 // Unlike the Y component, we don't need to interleave even and odd elements
263 // manually. This is because the first vector already contains even-indexed
264 // values from the lower RGB block, and the second vector contains
265 // even-indexed values from the higher RGB block. As a result, the values
266 // are already sorted in the correct order for output.
267 11200 uint8x16_t uv_index = {1, 3, 5, 7, 9, 11, 13, 15,
268 17, 19, 21, 23, 25, 27, 29, 31};
269
270 // Normalize down by right-shifting the fixed-point result
271 // vshrn_n can only shift by an immediate value between 1 and 16.
272 // Since kWeightScale is 20, we shift right by (kWeightScale - 8) = 12
273 // bits instead. This leaves the relevant 8-bit result in the second
274 // byte of each 16-bit element, which is why the lookup table is
275 // constructed with only odd indices: they extract that second byte
276 // from each element.
277 11200 int16x4_t tmp_lo_lo = vshrn_n_s32(vec[0], kWeightScale - 8);
278 11200 int16x8_t tmp_lo_hi = vshrn_high_n_s32(tmp_lo_lo, vec[1], kWeightScale - 8);
279 11200 int16x4_t tmp_hi_lo = vshrn_n_s32(vec[2], kWeightScale - 8);
280 11200 int16x8_t tmp_hi_hi = vshrn_high_n_s32(tmp_hi_lo, vec[3], kWeightScale - 8);
281
282 11200 uint8x16x2_t tmp;
283 11200 tmp.val[0] = vreinterpretq_u8(
284 11200 tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the first vector
285 11200 tmp.val[1] = vreinterpretq_u8(
286 11200 tmp_hi_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the second vector
287 11200 uint8x16_t output = vqtbl2q_u8(tmp, uv_index);
288
289 22400 return output;
290 11200 }
291
292 10800 static void load_rgb_2x(uint32x4_t r0[4], uint32x4_t g0[4], uint32x4_t b0[4],
293 uint32x4_t r1[4], uint32x4_t g1[4], uint32x4_t b1[4],
294 const uint8_t *src_row, const size_t index) {
295 10800 uint8x16_t tmp_b0, tmp_b1, tmp_g0, tmp_g1, tmp_r0, tmp_r1;
296 // Load 32 pixels: two vectors of interleaved channels
297
298 if constexpr (kAlpha) {
299 // 4-channel input (RGBA or BGRA)
300 5400 uint8x16x4_t vsrc0 = vld4q_u8(src_row + scn * index);
301 5400 uint8x16x4_t vsrc1 =
302 5400 vld4q_u8(src_row + scn * index + scn * kVectorLength);
303
304 5400 tmp_b0 = vsrc0.val[b_index_];
305 5400 tmp_g0 = vsrc0.val[g_index_];
306 5400 tmp_r0 = vsrc0.val[r_index_];
307
308 5400 tmp_b1 = vsrc1.val[b_index_];
309 5400 tmp_g1 = vsrc1.val[g_index_];
310 5400 tmp_r1 = vsrc1.val[r_index_];
311 5400 } else {
312 // 3-channel input (RGB or BGR)
313 5400 uint8x16x3_t vsrc0 = vld3q_u8(src_row + scn * index);
314 5400 uint8x16x3_t vsrc1 =
315 5400 vld3q_u8(src_row + scn * index + scn * kVectorLength);
316
317 5400 tmp_b0 = vsrc0.val[b_index_];
318 5400 tmp_g0 = vsrc0.val[g_index_];
319 5400 tmp_r0 = vsrc0.val[r_index_];
320
321 5400 tmp_b1 = vsrc1.val[b_index_];
322 5400 tmp_g1 = vsrc1.val[g_index_];
323 5400 tmp_r1 = vsrc1.val[r_index_];
324 5400 }
325 // After loading the vector, we extend the channels and separate even and
326 // odd elements. This separation is important for UV calculation, as only
327 // the even-indexed values are used.
328 10800 uint8x16_t indices[4] = {
329 0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 4, 0xff, 0xff,
330 0xff, 6, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 3, 0xff,
331 0xff, 0xff, 5, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 8,
332 0xff, 0xff, 0xff, 10, 0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff,
333 14, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, 11, 0xff, 0xff,
334 0xff, 13, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
335
336 // Expand each 8-bit channel into 32-bit vectors using table lookup and
337 // reinterpret
338 KLEIDICV_FORCE_LOOP_UNROLL
339
8/8
✓ Branch 0 taken 2700 times.
✓ Branch 1 taken 10800 times.
✓ Branch 2 taken 2700 times.
✓ Branch 3 taken 10800 times.
✓ Branch 4 taken 2700 times.
✓ Branch 5 taken 10800 times.
✓ Branch 6 taken 2700 times.
✓ Branch 7 taken 10800 times.
54000 for (int i = 0; i < 4; i++) {
340 43200 r0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r0, indices[i]));
341 43200 g0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g0, indices[i]));
342 43200 b0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b0, indices[i]));
343 43200 r1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r1, indices[i]));
344 43200 g1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g1, indices[i]));
345 43200 b1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b1, indices[i]));
346 43200 }
347 10800 }
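
A small AArch64-only sketch (it needs <arm_neon.h> and an AArch64 compiler) of the widening trick above: TBL returns zero for any out-of-range index byte, so a single vqtbl1q_u8 both selects a channel byte and zero-extends it into a 32-bit lane. The index vector below is the first of the four used above, which is also why lanes [0] and [2] of the widened arrays hold the even-numbered columns that the chroma path later gathers.

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  // Sixteen 8-bit channel values 0, 10, 20, ..., 150.
  uint8_t bytes[16];
  for (int i = 0; i < 16; i++) bytes[i] = static_cast<uint8_t>(10 * i);
  uint8x16_t src = vld1q_u8(bytes);

  // Select bytes 0, 2, 4, 6 and pad with 0xff (out of range -> 0x00).
  uint8x16_t idx = {0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff,
                    4, 0xff, 0xff, 0xff, 6, 0xff, 0xff, 0xff};
  uint32x4_t widened = vreinterpretq_u32_u8(vqtbl1q_u8(src, idx));

  uint32_t out[4];
  vst1q_u32(out, widened);
  std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  // 0 20 40 60
  return 0;
}
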
348
349 static constexpr size_t r_index_ = RGB ? 0 : 2;
350 static constexpr size_t g_index_ = 1;
351 static constexpr size_t b_index_ = RGB ? 2 : 0;
352 static constexpr size_t scn = kAlpha ? 4 : 3;
353 };
354
355 } // namespace kleidicv::neon
356
357 #endif // KLEIDICV_RGB_TO_YUV420_H
358