KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 122 122 100.0%
Functions: 24 24 100.0%
Branches: 56 56 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/conversions/rgb_to_yuv.h"
6 #include "kleidicv/kleidicv.h"
7 #include "kleidicv/neon.h"
8
9 namespace kleidicv::neon {
10
// Converts interleaved 8-bit RGB-family pixels to interleaved 8-bit YUV pixels.
//
// Template parameters:
//   BGR    - when true the red channel is read from index 2 and blue from
//            index 0; otherwise red is index 0 and blue is index 2.
//   kAlpha - when true each input pixel has 4 channels, otherwise 3. The
//            fourth channel is never used in the computation.
//
// The output always has 3 channels per pixel, in Y, U, V order.
//
// Both paths use the same fixed-point formulas (the weights and kWeightScale
// are declared outside this listing):
//   Y = rshift(R * kRYWeight + G * kGYWeight + B * kBYWeight, kWeightScale)
//   U = rshift((B - Y) * kBUWeight + half_, kWeightScale)
//   V = rshift((R - Y) * kRVWeight + half_, kWeightScale)
// where rshift is a rounding right shift and each result is saturated to
// the uint8_t range.
//
// NOTE(review): vector_path()/scalar_path() are presumably invoked by the
// row-processing framework reached through apply_operation_by_rows() and the
// UnrollOnce / TryToAvoidTailLoop bases; those are not visible here.
11 template <bool BGR, bool kAlpha>
12 class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
13 public:
14 using VecTraits = neon::VecTraits<uint8_t>;
15 using ScalarType = VecTraits::ScalarType;
16 using VectorType = VecTraits::VectorType;
// Register group used for the raw input: four 16-byte vectors when the input
// has an alpha channel, three otherwise.
17 using RawSourceVectorType =
18 typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
19
20 explicit RGBToYUVAll() = default;
21
22 // Returns the number of channels in the input image.
23 292 static constexpr size_t input_channels() {
24 292 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
25 }
26
// Converts 16 pixels at once: reads 16 input pixels from src and stores
// 16 interleaved YUV pixels (48 bytes) to dst.
//
// Each colour channel is first zero-extended into two int16x8_t vectors:
// the "_l" vector holds pixels 0-7 and the "_h" vector holds pixels 8-15.
27 1880 void vector_path(const ScalarType *src, ScalarType *dst) {
28 1880 RawSourceVectorType vsrc;
29 1880 int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
30 if constexpr (kAlpha) {
// NOTE(review): VecTraits::load is defined elsewhere; the shuffles below
// only make sense if it loads 64 consecutive bytes WITHOUT de-interleaving,
// so that each vsrc.val[i] holds 4 whole 4-channel pixels - confirm.
31 940 VecTraits::load(src, vsrc);
32
// vuzp1q_u8 keeps the even-indexed bytes of both operands, i.e. channels
// 0 and 2 of 8 pixels, alternating (R,B,R,B,... or B,R,B,R,... for BGR).
33 940 uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
34 940 uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
// Transposing against a zero vector zero-extends to 16 bits: vtrn1q_u8
// picks the even bytes (channel 0), vtrn2q_u8 the odd bytes (channel 2).
35 if constexpr (BGR) {
36 470 b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
37 470 b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
38 470 r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
39 470 r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
40 } else {
41 470 r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
42 470 r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
43 470 b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
44 470 b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
45 }
// vuzp2q_u8 keeps the odd-indexed bytes: channel 1 (green) and channel 3
// (alpha), alternating. Only green is extracted; alpha is dropped.
46 940 uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
47 940 g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
48 940 uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
49 940 g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
50 940 } else {
51 // Load deinterleaved
52 940 vsrc = vld3q_u8(src);
// Zipping with a zero vector zero-extends each byte to 16 bits: vzip1q_u8
// covers the low 8 lanes (pixels 0-7), vzip2q_u8 the high 8 (pixels 8-15).
53 940 r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
54 940 r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
55 940 g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
56 940 g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
57 940 b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
58 940 b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
59 }
60 // Compute Y value in 32-bit precision
// Naming of the 32-bit accumulators: the first suffix letter selects the
// source vector (_l / _h), the second selects its low or high four lanes.
61 1880 int16x8_t y_l, y_h;
62 {
63 1880 int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
64 1880 int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
65 1880 int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
66 1880 int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
67
68 1880 y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
69 1880 y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
70 1880 y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
71 1880 y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
72
73 1880 y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
74 1880 y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
75 1880 y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
76 1880 y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
77
78 1880 y_l = combine_scaled_s16(y_ll, y_lh);
79 1880 y_h = combine_scaled_s16(y_hl, y_hh);
80 1880 }
81
82 // Using the 16-bit Y value, calculate U
// U = ((B - Y) * kBUWeight + half_), then scaled back down. The accumulators
// start at half_ so the offset is added by the multiply-accumulate itself.
// NOTE(review): vqsubq is a wrapper declared elsewhere - presumably the
// saturating 16-bit subtract (vqsubq_s16); confirm.
83 1880 int16x8_t u_l, u_h;
84 {
85 1880 int16x8_t uy_l = vqsubq(b_l, y_l);
86 1880 int16x8_t uy_h = vqsubq(b_h, y_h);
87
88 1880 int32x4_t u_ll = vdupq_n_s32(half_);
89 1880 int32x4_t u_lh = u_ll;
90 1880 int32x4_t u_hl = u_ll;
91 1880 int32x4_t u_hh = u_ll;
92
93 1880 u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
94 1880 u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
95 1880 u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
96 1880 u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
97
98 1880 u_l = combine_scaled_s16(u_ll, u_lh);
99 1880 u_h = combine_scaled_s16(u_hl, u_hh);
100 1880 }
101
102 // Using the 16-bit Y value, calculate V
// Same scheme as U, but based on (R - Y) and weighted by kRVWeight.
103 1880 int16x8_t v_l, v_h;
104 {
105 1880 int16x8_t vy_l = vqsubq(r_l, y_l);
106 1880 int16x8_t vy_h = vqsubq(r_h, y_h);
107
108 1880 int32x4_t v_ll = vdupq_n_s32(half_);
109 1880 int32x4_t v_lh = v_ll;
110 1880 int32x4_t v_hl = v_ll;
111 1880 int32x4_t v_hh = v_ll;
112
113 1880 v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
114 1880 v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
115 1880 v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
116 1880 v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
117
118 1880 v_l = combine_scaled_s16(v_ll, v_lh);
119 1880 v_h = combine_scaled_s16(v_hl, v_hh);
120 1880 }
121
122 // Narrow the results to 8 bits
// vqmovun_s16 saturates: negative values become 0, values above 255 become 255.
123 1880 uint8x16x3_t yuv;
124 1880 yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
125 1880 yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
126 1880 yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
127
128 // Store interleaved YUV pixels to memory.
129 1880 vst3q_u8(dst, yuv);
130 1880 }
131
// Converts a single pixel: reads the R, G and B channels at src[r_index_],
// src[g_index_] and src[b_index_], and writes Y, U, V to dst[0..2].
// NOTE(review): rounding_shift_right and saturating_cast are declared
// elsewhere; their exact rounding/clamping rules are assumed from the names.
// NOTE(review): half_ is uint32_t, so the U and V sums below are presumably
// evaluated in unsigned arithmetic before being converted back to int32_t
// (depends on the weight types, which are not visible here).
132 412 void scalar_path(const ScalarType *src, ScalarType *dst) {
133 824 int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
134 412 src[b_index_] * kBYWeight;
135 412 y = rounding_shift_right(y, kWeightScale);
136 412 int32_t u = (src[b_index_] - y) * kBUWeight + half_;
137 412 u = rounding_shift_right(u, kWeightScale);
138 412 int32_t v = (src[r_index_] - y) * kRVWeight + half_;
139 412 v = rounding_shift_right(v, kWeightScale);
140 412 dst[0] = saturating_cast<int32_t, uint8_t>(y);
141 412 dst[1] = saturating_cast<int32_t, uint8_t>(u);
142 412 dst[2] = saturating_cast<int32_t, uint8_t>(v);
143 412 }
144
145 private:
// Positions of the colour channels inside one input pixel.
146 static constexpr size_t r_index_ = BGR ? 2 : 0;
147 static constexpr size_t g_index_ = 1;
148 static constexpr size_t b_index_ = BGR ? 0 : 2;
// NOTE(review): step_ is not referenced anywhere in this listing.
149 static constexpr size_t step_ = kAlpha ? 4 : 3;
// Offset added to U and V before scaling: (255 / 2 + 1) = 128, pre-shifted
// left by kWeightScale so that it equals 128 after the final right shift.
150 static constexpr uint32_t half_ =
151 (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
152
// Applies a rounding right shift by kWeightScale to two 4-lane 32-bit
// vectors and narrows them into one int16x8_t: `a` fills the low four
// lanes and `b` the high four.
153 11280 static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
154 11280 return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
155 }
156 }; // end of class RGBToYUVAll<bool BGR, bool kAlpha>
157
// Shared driver for all RGB-family to YUV conversions in this file.
//
// Validates the arguments, then wraps the buffers in Rectangle / Rows
// helpers and runs `operation` over the image through
// apply_operation_by_rows().
//
// Parameters:
//   operation  - conversion object; must provide input_channels().
//   src        - source image buffer.
//   src_stride - stride of the source buffer (presumably in bytes between
//                consecutive rows - confirm against the public header).
//   dst        - destination image buffer; always 3 channels per pixel.
//   dst_stride - stride of the destination buffer (same assumption as above).
//   width      - image width passed to Rectangle.
//   height     - image height passed to Rectangle.
//
// Returns KLEIDICV_OK once the operation has been applied.
// NOTE(review): the CHECK_* macros are defined elsewhere. The execution
// counts recorded below (356 -> 340 -> 324 -> 292) are consistent with each
// macro returning early from this function when its check fails; the error
// values they return are not visible in this listing.
158 template <typename OperationType, typename ScalarType>
159 356 kleidicv_error_t rgb2yuv_operation(OperationType &operation,
160 const ScalarType *src, size_t src_stride,
161 ScalarType *dst, size_t dst_stride,
162 size_t width, size_t height) {
163
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 85 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 85 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 85 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 85 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 85 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 85 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 85 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 85 times.
356 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
164
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 81 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 81 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 81 times.
340 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
165
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 73 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 73 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 73 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 77 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 73 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 73 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 77 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 73 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 73 times.
324 CHECK_IMAGE_SIZE(width, height);
166
// The source row view uses the operation's channel count (3 or 4); the
// destination row view is fixed at 3 channels (Y, U, V).
167 292 Rectangle rect{width, height};
168 292 Rows src_rows{src, src_stride, operation.input_channels()};
169 292 Rows dst_rows{dst, dst_stride, 3};
170
171 292 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
172 292 return KLEIDICV_OK;
173 356 }
174
// Concrete conversions. The template arguments are <BGR, kAlpha>:
//   BGR    = true  -> blue is channel 0 and red is channel 2 in the input.
//   kAlpha = true  -> the input has 4 channels per pixel instead of 3.
175 using RGBToYUV = RGBToYUVAll<false, false>;
176 using RGBAToYUV = RGBToYUVAll<false, true>;
177 using BGRToYUV = RGBToYUVAll<true, false>;
178 using BGRAToYUV = RGBToYUVAll<true, true>;
179
// Converts a 3-channel RGB u8 image to a 3-channel YUV u8 image.
// Creates an RGBToYUV operation, forwards all arguments unchanged to
// rgb2yuv_operation() and returns its result.
180 KLEIDICV_TARGET_FN_ATTRS
181 89 kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride,
182 uint8_t *dst, size_t dst_stride, size_t width,
183 size_t height) {
184 89 RGBToYUV operation;
185 267 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
186 89 height);
187 89 }
188
// Converts a 4-channel RGBA u8 image to a 3-channel YUV u8 image.
// Creates an RGBAToYUV operation, forwards all arguments unchanged to
// rgb2yuv_operation() and returns its result.
189 KLEIDICV_TARGET_FN_ATTRS
190 89 kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride,
191 uint8_t *dst, size_t dst_stride, size_t width,
192 size_t height) {
193 89 RGBAToYUV operation;
194 267 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
195 89 height);
196 89 }
197
// Converts a 3-channel BGR u8 image to a 3-channel YUV u8 image.
// Creates a BGRToYUV operation, forwards all arguments unchanged to
// rgb2yuv_operation() and returns its result.
198 KLEIDICV_TARGET_FN_ATTRS
199 89 kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride,
200 uint8_t *dst, size_t dst_stride, size_t width,
201 size_t height) {
202 89 BGRToYUV operation;
203 267 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
204 89 height);
205 89 }
206
// Converts a 4-channel BGRA u8 image to a 3-channel YUV u8 image.
// Creates a BGRAToYUV operation, forwards all arguments unchanged to
// rgb2yuv_operation() and returns its result.
207 KLEIDICV_TARGET_FN_ATTRS
208 89 kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride,
209 uint8_t *dst, size_t dst_stride, size_t width,
210 size_t height) {
211 89 BGRAToYUV operation;
212 267 return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
213 89 height);
214 89 }
215
216 } // namespace kleidicv::neon
217