Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv/conversions/rgb_to_yuv.h" | ||
6 | #include "kleidicv/kleidicv.h" | ||
7 | #include "kleidicv/neon.h" | ||
8 | |||
9 | namespace kleidicv::neon { | ||
10 | |||
11 | template <bool BGR, bool kAlpha> | ||
12 | class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop { | ||
13 | public: | ||
14 | using VecTraits = neon::VecTraits<uint8_t>; | ||
15 | using ScalarType = VecTraits::ScalarType; | ||
16 | using VectorType = VecTraits::VectorType; | ||
17 | using RawSourceVectorType = | ||
18 | typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type; | ||
19 | |||
20 | explicit RGBToYUVAll() = default; | ||
21 | |||
22 | // Returns the number of channels in the input image. | ||
23 | 292 | static constexpr size_t input_channels() { | |
24 | 292 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
25 | } | ||
26 | |||
27 | 1880 | void vector_path(const ScalarType *src, ScalarType *dst) { | |
28 | 1880 | RawSourceVectorType vsrc; | |
29 | 1880 | int16x8_t r_l, r_h, g_l, g_h, b_l, b_h; | |
30 | if constexpr (kAlpha) { | ||
31 | 940 | VecTraits::load(src, vsrc); | |
32 | |||
33 | 940 | uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]); | |
34 | 940 | uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]); | |
35 | if constexpr (BGR) { | ||
36 | 470 | b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0))); | |
37 | 470 | b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0))); | |
38 | 470 | r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0))); | |
39 | 470 | r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0))); | |
40 | } else { | ||
41 | 470 | r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0))); | |
42 | 470 | r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0))); | |
43 | 470 | b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0))); | |
44 | 470 | b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0))); | |
45 | } | ||
46 | 940 | uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]); | |
47 | 940 | g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0))); | |
48 | 940 | uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]); | |
49 | 940 | g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0))); | |
50 | 940 | } else { | |
51 | // Load deinterleaved | ||
52 | 940 | vsrc = vld3q_u8(src); | |
53 | 940 | r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0))); | |
54 | 940 | r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0))); | |
55 | 940 | g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0))); | |
56 | 940 | g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0))); | |
57 | 940 | b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0))); | |
58 | 940 | b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0))); | |
59 | } | ||
60 | // Compute Y value in 32-bit precision | ||
61 | 1880 | int16x8_t y_l, y_h; | |
62 | { | ||
63 | 1880 | int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight); | |
64 | 1880 | int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight); | |
65 | 1880 | int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight); | |
66 | 1880 | int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight); | |
67 | |||
68 | 1880 | y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight); | |
69 | 1880 | y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight); | |
70 | 1880 | y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight); | |
71 | 1880 | y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight); | |
72 | |||
73 | 1880 | y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight); | |
74 | 1880 | y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight); | |
75 | 1880 | y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight); | |
76 | 1880 | y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight); | |
77 | |||
78 | 1880 | y_l = combine_scaled_s16(y_ll, y_lh); | |
79 | 1880 | y_h = combine_scaled_s16(y_hl, y_hh); | |
80 | 1880 | } | |
81 | |||
82 | // Using the 16-bit Y value, calculate U | ||
83 | 1880 | int16x8_t u_l, u_h; | |
84 | { | ||
85 | 1880 | int16x8_t uy_l = vqsubq(b_l, y_l); | |
86 | 1880 | int16x8_t uy_h = vqsubq(b_h, y_h); | |
87 | |||
88 | 1880 | int32x4_t u_ll = vdupq_n_s32(half_); | |
89 | 1880 | int32x4_t u_lh = u_ll; | |
90 | 1880 | int32x4_t u_hl = u_ll; | |
91 | 1880 | int32x4_t u_hh = u_ll; | |
92 | |||
93 | 1880 | u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight); | |
94 | 1880 | u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight); | |
95 | 1880 | u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight); | |
96 | 1880 | u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight); | |
97 | |||
98 | 1880 | u_l = combine_scaled_s16(u_ll, u_lh); | |
99 | 1880 | u_h = combine_scaled_s16(u_hl, u_hh); | |
100 | 1880 | } | |
101 | |||
102 | // Using the 16-bit Y value, calculate V | ||
103 | 1880 | int16x8_t v_l, v_h; | |
104 | { | ||
105 | 1880 | int16x8_t vy_l = vqsubq(r_l, y_l); | |
106 | 1880 | int16x8_t vy_h = vqsubq(r_h, y_h); | |
107 | |||
108 | 1880 | int32x4_t v_ll = vdupq_n_s32(half_); | |
109 | 1880 | int32x4_t v_lh = v_ll; | |
110 | 1880 | int32x4_t v_hl = v_ll; | |
111 | 1880 | int32x4_t v_hh = v_ll; | |
112 | |||
113 | 1880 | v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight); | |
114 | 1880 | v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight); | |
115 | 1880 | v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight); | |
116 | 1880 | v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight); | |
117 | |||
118 | 1880 | v_l = combine_scaled_s16(v_ll, v_lh); | |
119 | 1880 | v_h = combine_scaled_s16(v_hl, v_hh); | |
120 | 1880 | } | |
121 | |||
122 | // Narrow the results to 8 bits | ||
123 | 1880 | uint8x16x3_t yuv; | |
124 | 1880 | yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h)); | |
125 | 1880 | yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h)); | |
126 | 1880 | yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h)); | |
127 | |||
128 | // Store interleaved YUV pixels to memory. | ||
129 | 1880 | vst3q_u8(dst, yuv); | |
130 | 1880 | } | |
131 | |||
132 | 412 | void scalar_path(const ScalarType *src, ScalarType *dst) { | |
133 | 824 | int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight + | |
134 | 412 | src[b_index_] * kBYWeight; | |
135 | 412 | y = rounding_shift_right(y, kWeightScale); | |
136 | 412 | int32_t u = (src[b_index_] - y) * kBUWeight + half_; | |
137 | 412 | u = rounding_shift_right(u, kWeightScale); | |
138 | 412 | int32_t v = (src[r_index_] - y) * kRVWeight + half_; | |
139 | 412 | v = rounding_shift_right(v, kWeightScale); | |
140 | 412 | dst[0] = saturating_cast<int32_t, uint8_t>(y); | |
141 | 412 | dst[1] = saturating_cast<int32_t, uint8_t>(u); | |
142 | 412 | dst[2] = saturating_cast<int32_t, uint8_t>(v); | |
143 | 412 | } | |
144 | |||
145 | private: | ||
146 | static constexpr size_t r_index_ = BGR ? 2 : 0; | ||
147 | static constexpr size_t g_index_ = 1; | ||
148 | static constexpr size_t b_index_ = BGR ? 0 : 2; | ||
149 | static constexpr size_t step_ = kAlpha ? 4 : 3; | ||
150 | static constexpr uint32_t half_ = | ||
151 | (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale; | ||
152 | |||
153 | 11280 | static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { | |
154 | 11280 | return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale); | |
155 | } | ||
156 | }; // end of class RGBToYUVAll<bool BGR, bool kAlpha> | ||
157 | |||
158 | template <typename OperationType, typename ScalarType> | ||
159 | 356 | kleidicv_error_t rgb2yuv_operation(OperationType &operation, | |
160 | const ScalarType *src, size_t src_stride, | ||
161 | ScalarType *dst, size_t dst_stride, | ||
162 | size_t width, size_t height) { | ||
163 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 85 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 85 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 85 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 85 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 85 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 85 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 85 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 85 times.
|
356 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
164 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 81 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 81 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 81 times.
|
340 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
165 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 73 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 73 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 73 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 77 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 73 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 73 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 77 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 73 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 73 times.
|
324 | CHECK_IMAGE_SIZE(width, height); |
166 | |||
167 | 292 | Rectangle rect{width, height}; | |
168 | 292 | Rows src_rows{src, src_stride, operation.input_channels()}; | |
169 | 292 | Rows dst_rows{dst, dst_stride, 3}; | |
170 | |||
171 | 292 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
172 | 292 | return KLEIDICV_OK; | |
173 | 356 | } | |
174 | |||
175 | using RGBToYUV = RGBToYUVAll</*BGR=*/false, /*kAlpha=*/false>; | ||
176 | using RGBAToYUV = RGBToYUVAll</*BGR=*/false, /*kAlpha=*/true>; | ||
177 | using BGRToYUV = RGBToYUVAll</*BGR=*/true, /*kAlpha=*/false>; | ||
178 | using BGRAToYUV = RGBToYUVAll</*BGR=*/true, /*kAlpha=*/true>; | ||
179 | |||
180 | KLEIDICV_TARGET_FN_ATTRS | ||
181 | 89 | kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
182 | uint8_t *dst, size_t dst_stride, size_t width, | ||
183 | size_t height) { | ||
184 | 89 | RGBToYUV operation; | |
185 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
186 | 89 | height); | |
187 | 89 | } | |
188 | |||
189 | KLEIDICV_TARGET_FN_ATTRS | ||
190 | 89 | kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
191 | uint8_t *dst, size_t dst_stride, size_t width, | ||
192 | size_t height) { | ||
193 | 89 | RGBAToYUV operation; | |
194 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
195 | 89 | height); | |
196 | 89 | } | |
197 | |||
198 | KLEIDICV_TARGET_FN_ATTRS | ||
199 | 89 | kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
200 | uint8_t *dst, size_t dst_stride, size_t width, | ||
201 | size_t height) { | ||
202 | 89 | BGRToYUV operation; | |
203 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
204 | 89 | height); | |
205 | 89 | } | |
206 | |||
207 | KLEIDICV_TARGET_FN_ATTRS | ||
208 | 89 | kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
209 | uint8_t *dst, size_t dst_stride, size_t width, | ||
210 | size_t height) { | ||
211 | 89 | BGRAToYUV operation; | |
212 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
213 | 89 | height); | |
214 | 89 | } | |
215 | |||
216 | } // namespace kleidicv::neon | ||
217 |