KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv444_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 122 122 100.0%
Functions: 21 21 100.0%
Branches: 61 61 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/conversions/rgb_to_yuv.h"
6 #include "kleidicv/kleidicv.h"
7 #include "kleidicv/neon.h"
8 #include "rgb_to_yuv444_coefficients.h"
9 namespace kleidicv::neon {
10
11 template <bool BGR, bool kAlpha>
12 class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
13 public:
14 using VecTraits = neon::VecTraits<uint8_t>;
15 using ScalarType = VecTraits::ScalarType;
16 using VectorType = VecTraits::VectorType;
17 using RawSourceVectorType =
18 typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
19
20 explicit RGBToYUVAll() = default;
21
22 // Returns the number of channels in the input image.
23 260 static constexpr size_t input_channels() {
24 260 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
25 }
26
27 KLEIDICV_FORCE_INLINE
28 1904 void vector_path(const ScalarType *src, ScalarType *dst) {
29 1904 RawSourceVectorType vsrc;
30 1904 int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
31 if constexpr (kAlpha) {
32 952 VecTraits::load(src, vsrc);
33
34 952 uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
35 952 uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
36 if constexpr (BGR) {
37 476 b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
38 476 b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
39 476 r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
40 476 r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
41 } else {
42 476 r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
43 476 r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
44 476 b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
45 476 b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
46 }
47 952 uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
48 952 g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
49 952 uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
50 952 g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
51 952 } else {
52 // Load deinterleaved
53 952 vsrc = vld3q_u8(src);
54 952 r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
55 952 r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
56 952 g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
57 952 g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
58 952 b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
59 952 b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
60 }
61 // Compute Y value in 32-bit precision
62 1904 int16x8_t y_l, y_h;
63 {
64 1904 int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
65 1904 int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
66 1904 int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
67 1904 int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
68
69 1904 y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
70 1904 y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
71 1904 y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
72 1904 y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
73
74 1904 y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
75 1904 y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
76 1904 y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
77 1904 y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
78
79 1904 y_l = combine_scaled_s16(y_ll, y_lh);
80 1904 y_h = combine_scaled_s16(y_hl, y_hh);
81 1904 }
82
83 // Using the 16-bit Y value, calculate U
84 1904 int16x8_t u_l, u_h;
85 {
86 1904 int16x8_t uy_l = vqsubq(b_l, y_l);
87 1904 int16x8_t uy_h = vqsubq(b_h, y_h);
88
89 1904 int32x4_t u_ll = vdupq_n_s32(half_);
90 1904 int32x4_t u_lh = u_ll;
91 1904 int32x4_t u_hl = u_ll;
92 1904 int32x4_t u_hh = u_ll;
93
94 1904 u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
95 1904 u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
96 1904 u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
97 1904 u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
98
99 1904 u_l = combine_scaled_s16(u_ll, u_lh);
100 1904 u_h = combine_scaled_s16(u_hl, u_hh);
101 1904 }
102
103 // Using the 16-bit Y value, calculate V
104 1904 int16x8_t v_l, v_h;
105 {
106 1904 int16x8_t vy_l = vqsubq(r_l, y_l);
107 1904 int16x8_t vy_h = vqsubq(r_h, y_h);
108
109 1904 int32x4_t v_ll = vdupq_n_s32(half_);
110 1904 int32x4_t v_lh = v_ll;
111 1904 int32x4_t v_hl = v_ll;
112 1904 int32x4_t v_hh = v_ll;
113
114 1904 v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
115 1904 v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
116 1904 v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
117 1904 v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
118
119 1904 v_l = combine_scaled_s16(v_ll, v_lh);
120 1904 v_h = combine_scaled_s16(v_hl, v_hh);
121 1904 }
122
123 // Narrow the results to 8 bits
124 1904 uint8x16x3_t yuv;
125 1904 yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
126 1904 yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
127 1904 yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
128
129 // Store interleaved YUV pixels to memory.
130 1904 vst3q_u8(dst, yuv);
131 1904 }
132
133 492 void scalar_path(const ScalarType *src, ScalarType *dst) {
134 984 int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
135 492 src[b_index_] * kBYWeight;
136 492 y = rounding_shift_right(y, kWeightScale);
137 492 int32_t u = (src[b_index_] - y) * kBUWeight + half_;
138 492 u = rounding_shift_right(u, kWeightScale);
139 492 int32_t v = (src[r_index_] - y) * kRVWeight + half_;
140 492 v = rounding_shift_right(v, kWeightScale);
141 492 dst[0] = saturating_cast<int32_t, uint8_t>(y);
142 492 dst[1] = saturating_cast<int32_t, uint8_t>(u);
143 492 dst[2] = saturating_cast<int32_t, uint8_t>(v);
144 492 }
145
146 private:
147 static constexpr size_t r_index_ = BGR ? 2 : 0;
148 static constexpr size_t g_index_ = 1;
149 static constexpr size_t b_index_ = BGR ? 0 : 2;
150 static constexpr size_t step_ = kAlpha ? 4 : 3;
151 static constexpr uint32_t half_ =
152 (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
153
154 KLEIDICV_FORCE_INLINE
155 11424 static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
156 11424 return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
157 }
158 }; // end of class RGBToYUVAll<bool BGR, bool kAlpha>
159
160 template <typename OperationType, typename ScalarType>
161 336 kleidicv_error_t rgb2yuv_operation(OperationType &operation,
162 const ScalarType *src, size_t src_stride,
163 ScalarType *dst, size_t dst_stride,
164 size_t width, size_t height) {
165
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 80 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 80 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 80 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 80 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 80 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 80 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 80 times.
336 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
166
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 76 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 76 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 76 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 76 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 76 times.
320 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
167
24/24
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 65 times.
✓ Branch 4 taken 11 times.
✓ Branch 5 taken 65 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 70 times.
✓ Branch 8 taken 5 times.
✓ Branch 9 taken 65 times.
✓ Branch 10 taken 11 times.
✓ Branch 11 taken 65 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 70 times.
✓ Branch 14 taken 5 times.
✓ Branch 15 taken 65 times.
✓ Branch 16 taken 11 times.
✓ Branch 17 taken 65 times.
✓ Branch 18 taken 6 times.
✓ Branch 19 taken 70 times.
✓ Branch 20 taken 5 times.
✓ Branch 21 taken 65 times.
✓ Branch 22 taken 11 times.
✓ Branch 23 taken 65 times.
304 CHECK_IMAGE_SIZE(width, height);
168
169 260 Rectangle rect{width, height};
170 260 Rows src_rows{src, src_stride, operation.input_channels()};
171 260 Rows dst_rows{dst, dst_stride, 3};
172
173 260 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
174 260 return KLEIDICV_OK;
175 336 }
176
177 using RGBToYUV = RGBToYUVAll<false, false>;
178 using RGBAToYUV = RGBToYUVAll<false, true>;
179 using BGRToYUV = RGBToYUVAll<true, false>;
180 using BGRAToYUV = RGBToYUVAll<true, true>;
181
182 KLEIDICV_TARGET_FN_ATTRS
183 360 kleidicv_error_t rgb_to_yuv444_u8(const uint8_t *src, size_t src_stride,
184 uint8_t *dst, size_t dst_stride, size_t width,
185 size_t height,
186 kleidicv_color_conversion_t color_format) {
187
5/5
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 84 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.
360 switch (color_format) {
188 case KLEIDICV_RGB_TO_YUV444: {
189 84 RGBToYUV operation;
190 168 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
191 84 width, height);
192 84 }
193
194 case KLEIDICV_BGR_TO_YUV444: {
195 84 BGRToYUV operation;
196 168 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
197 84 width, height);
198 84 }
199
200 case KLEIDICV_RGBA_TO_YUV444: {
201 84 RGBAToYUV operation;
202 168 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
203 84 width, height);
204 84 }
205
206 case KLEIDICV_BGRA_TO_YUV444: {
207 84 BGRAToYUV operation;
208 168 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
209 84 width, height);
210 84 }
211
212 default:
213 24 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
214 }
215
216 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
217 360 }
218
219 } // namespace kleidicv::neon
220