Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <utility> | ||
6 | |||
7 | #include "kleidicv/conversions/yuv_420_to_rgb.h" | ||
8 | #include "kleidicv/kleidicv.h" | ||
9 | #include "kleidicv/neon.h" | ||
10 | #include "yuv420_to_rgb_neon.h" | ||
11 | |||
12 | namespace kleidicv::neon { | ||
13 | template <bool BGR, bool kAlpha> | ||
14 | class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha>, | ||
15 | public UnrollOnce, | ||
16 | public TryToAvoidTailLoop { | ||
17 | public: | ||
18 | using VecTraits = neon::VecTraits<uint8_t>; | ||
19 | using ScalarType = VecTraits::ScalarType; | ||
20 | using VectorType = VecTraits::VectorType; | ||
21 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::de_interleave_indices_; | ||
22 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb; | ||
23 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::v_first_; | ||
24 | |||
25 | 380 | explicit YUVpToRGBxOrBGRx(bool is_yv12) | |
26 | 380 | : YUV420XToRGBxOrBGRx<BGR, kAlpha>(is_yv12) {} | |
27 | |||
28 | 288 | void vector_path(VectorType &y0, VectorType &y1, VectorType &y2, | |
29 | VectorType &y3, VectorType &u, VectorType &v, | ||
30 | ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { | ||
31 | // Table-lookup indices that spread four consecutive bytes across four | ||
32 | // 32-bit lanes. 0xff is out of range, so vqtbl1q_u8 zero-fills those bytes. | ||
33 | 288 | uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, | |
34 | 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff}; | ||
35 | |||
36 | 288 | uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff, | |
37 | 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff}; | ||
38 | |||
39 | 288 | uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, | |
40 | 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff}; | ||
41 | |||
42 | 288 | uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff, | |
43 | 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff}; | ||
44 | |||
45 | // Expand each 8-bit channel into 32-bit vectors using table lookup and | ||
46 | // reinterpret | ||
47 | 288 | int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo)); | |
48 | 288 | int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi)); | |
49 | 288 | int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo)); | |
50 | 288 | int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi)); | |
51 | |||
52 | 288 | int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo)); | |
53 | 288 | int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi)); | |
54 | 288 | int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo)); | |
55 | 288 | int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi)); | |
56 | |||
57 | 288 | constexpr size_t step = kAlpha ? 4 * 16 : 3 * 16; | |
58 | |||
59 | 576 | yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0, | |
60 | 288 | rgbx_row_1); | |
61 | |||
62 | 576 | yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi, | |
63 | 288 | rgbx_row_0 + step, rgbx_row_1 + step); | |
64 | 288 | } | |
65 | |||
66 | // Processes inputs which are not long enough to fit a vector. | ||
67 | 5184 | void scalar_path(size_t length, const ScalarType *y_row_0, | |
68 | const ScalarType *y_row_1, const ScalarType *u_row, | ||
69 | const ScalarType *v_row, ScalarType *rgbx_row_0, | ||
70 | ScalarType *rgbx_row_1) { | ||
71 | 5184 | const uint8_t *y_rows[2] = {y_row_0, y_row_1}; | |
72 | 5184 | uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1}; | |
73 | |||
74 | 5184 | int32_t u_m128 = 0, v_m128 = 0; | |
75 |
8/8✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 4646 times.
✓ Branch 2 taken 1296 times.
✓ Branch 3 taken 4646 times.
✓ Branch 4 taken 1296 times.
✓ Branch 5 taken 4646 times.
✓ Branch 6 taken 1296 times.
✓ Branch 7 taken 4646 times.
|
23768 | for (size_t index = 0; index < length; ++index) { |
76 | 18584 | disable_loop_vectorization(); | |
77 | |||
78 | // There is one {U, V} pair for 4 Y values. | ||
79 |
8/8✓ Branch 0 taken 2314 times.
✓ Branch 1 taken 2332 times.
✓ Branch 2 taken 2314 times.
✓ Branch 3 taken 2332 times.
✓ Branch 4 taken 2314 times.
✓ Branch 5 taken 2332 times.
✓ Branch 6 taken 2314 times.
✓ Branch 7 taken 2332 times.
|
18584 | if ((index % 2) == 0) { |
80 | 9328 | u_m128 = u_row[0] - 128; | |
81 | 9328 | v_m128 = v_row[0] - 128; | |
82 | 9328 | u_row += 1; | |
83 | 9328 | v_row += 1; | |
84 |
8/8✓ Branch 0 taken 2062 times.
✓ Branch 1 taken 270 times.
✓ Branch 2 taken 2062 times.
✓ Branch 3 taken 270 times.
✓ Branch 4 taken 2062 times.
✓ Branch 5 taken 270 times.
✓ Branch 6 taken 2062 times.
✓ Branch 7 taken 270 times.
|
9328 | if (v_first_) { |
85 | 1080 | std::swap(u_m128, v_m128); | |
86 | 1080 | } | |
87 | 9328 | } | |
88 | |||
89 | 18584 | yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows); | |
90 | 18584 | } | |
91 | 5184 | } | |
92 | }; // end of class YUVpToRGBxOrBGRx<bool, bool> | ||
93 | |||
94 | using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>; | ||
95 | using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>; | ||
96 | using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>; | ||
97 | using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>; | ||
98 | |||
99 | template <typename OperationType, typename ScalarType> | ||
100 | 380 | kleidicv_error_t yuv2rgbx_operation(OperationType &operation, | |
101 | const ScalarType *src, size_t src_stride, | ||
102 | ScalarType *dst, size_t dst_stride, | ||
103 | size_t width, size_t height, size_t begin, | ||
104 | size_t end) { | ||
105 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 91 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 91 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 91 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 91 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 91 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 91 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 91 times.
|
380 | CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); |
106 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 87 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 87 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 87 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 87 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 87 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 87 times.
|
364 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
107 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 83 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 83 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 79 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 79 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 83 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 79 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 79 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 83 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 79 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 79 times.
|
348 | CHECK_IMAGE_SIZE(width, height); |
108 | |||
109 | // Pointer to the start of the U plane. | ||
110 | // Since `src` points to a planar YUV buffer, the Y plane comes first, | ||
111 | // occupying `src_stride * height` bytes. | ||
112 | 316 | const ScalarType *u = src + src_stride * height; | |
113 | // Pointer to the start of the V plane. | ||
114 | // The V plane follows the U plane. Both U and V planes are subsampled 2:1 | ||
115 | // horizontally and vertically, so a chroma row holds `width / 2` samples | ||
116 | // and two chroma rows are packed into each `src_stride`-sized row of the | ||
117 | // buffer (see `uv_strides` below). The V plane is therefore taken to start | ||
118 | // `height / 4` whole buffer rows after the start of the U plane. When | ||
119 | // `(height % 4) / 2` is 1, the U plane also occupies the first half of the | ||
120 | // next buffer row, so V begins a further `width / 2` bytes in. | ||
121 | // The formula assumes the chroma planes use the same `src_stride` as the | ||
122 | // Y plane. | ||
123 | 632 | const ScalarType *v = | |
124 | 316 | u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); | |
125 | |||
126 | // These indices control how U and V row strides are selected across the image | ||
127 | // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to | ||
128 | // two luma (Y) rows. However, when the image height is not divisible by 4, | ||
129 | // the mapping between chroma and luma rows becomes asymmetric. Specifically, | ||
130 | // when `height % 4 == 2`, the start of the V plane is offset by one chroma | ||
131 | // row relative to U. | ||
132 | // | ||
133 | // This results in U and V rows being interleaved with a phase difference, | ||
134 | // which must be accounted for during row-wise traversal. To handle this, | ||
135 | // `u_index` and `v_index` are used to alternate the stride selection | ||
136 | // independently for U and V across the loop. | ||
137 | // | ||
138 | // This mechanism ensures that memory access patterns remain correct, | ||
139 | // especially in layouts where U and V share a contiguous buffer with | ||
140 | // alternating strides. Offsetting `v_index` allows the traversal logic to | ||
141 | // maintain correct alignment and prevents misaligned or incorrect reads from | ||
142 | // the chroma buffer. | ||
143 | 316 | size_t u_index = 0; | |
144 | 316 | size_t v_index = height % 4 == 2 ? 1 : 0; | |
145 | |||
146 | // Compute the actual row range in the Y plane (full resolution). | ||
147 | // Since each UV row maps to 2 Y rows, we double the begin/end indices. | ||
148 | 316 | size_t row_begin = begin * 2; | |
149 | 316 | size_t row_end = std::min<size_t>(height, end * 2); | |
150 | 316 | size_t row_uv = begin; | |
151 | |||
152 | // UV stepping pattern: first half of row, then padded second half. | ||
153 | // Needed to match row strides between chroma and luma components. | ||
154 | 316 | size_t uv_strides[2] = {width / 2, src_stride - width / 2}; | |
155 | |||
156 | // Calculate starting pointers for Y, U, and V planes at the given stripe | ||
157 | // start. | ||
158 | 316 | const ScalarType *y0 = src + row_begin * src_stride; | |
159 | 316 | u = u + row_uv * src_stride / 2; | |
160 | 316 | v = v + row_uv * src_stride / 2; | |
161 | |||
162 | 316 | size_t dcn = operation.output_channels(); | |
163 |
8/8✓ Branch 0 taken 1336 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1336 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 1336 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 1336 times.
✓ Branch 7 taken 79 times.
|
5660 | for (size_t h = row_begin; h < row_end; h += 2) { |
164 | 5344 | ScalarType *row0 = dst + dst_stride * h; | |
165 | 5344 | ScalarType *row1 = dst + dst_stride * (h + 1); | |
166 | 5344 | const ScalarType *y1 = y0 + src_stride; | |
167 | |||
168 | // Guard for odd-height images. | ||
169 | // If the last row in the stripe is unpaired (odd number of rows), | ||
170 | // point the second row at the first one to avoid out-of-bounds access. | ||
171 |
8/8✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 40 times.
✓ Branch 2 taken 1296 times.
✓ Branch 3 taken 40 times.
✓ Branch 4 taken 1296 times.
✓ Branch 5 taken 40 times.
✓ Branch 6 taken 1296 times.
✓ Branch 7 taken 40 times.
|
5344 | if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { |
172 | 160 | row1 = row0; | |
173 | 160 | y1 = y0; | |
174 | 160 | } | |
175 | |||
176 | 5344 | LoopUnroll2 loop{width, kVectorLength}; | |
177 | |||
178 | 5632 | loop.unroll_twice([&](size_t index) { | |
179 | 288 | uint8x16_t u_vec = vld1q_u8(u + index / 2); | |
180 | 288 | uint8x16_t v_vec = vld1q_u8(v + index / 2); | |
181 | 288 | uint8x16_t y0_vec = vld1q_u8(y0 + index); | |
182 | 288 | uint8x16_t y1_vec = vld1q_u8(y1 + index); | |
183 | 288 | uint8x16_t y2_vec = vld1q_u8(y0 + index + kVectorLength); | |
184 | 288 | uint8x16_t y3_vec = vld1q_u8(y1 + index + kVectorLength); | |
185 | |||
186 | 576 | operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec, | |
187 | 288 | &row0[index * dcn], &row1[index * dcn]); | |
188 | 288 | }); | |
189 | |||
190 | 10528 | loop.remaining([&](size_t index, size_t length) { | |
191 | 10368 | operation.scalar_path(length - index, y0 + index, y1 + index, | |
192 | 5184 | u + index / 2, v + index / 2, &row0[index * dcn], | |
193 | 5184 | &row1[index * dcn]); | |
194 | 5184 | }); | |
195 | |||
196 | 5344 | y0 += src_stride * 2; | |
197 | 5344 | u += uv_strides[(u_index++) & 1]; | |
198 | 5344 | v += uv_strides[(v_index++) & 1]; | |
199 | 5344 | } | |
200 | |||
201 | 316 | return KLEIDICV_OK; | |
202 | 380 | } | |
203 | |||
204 | KLEIDICV_TARGET_FN_ATTRS | ||
205 | 95 | kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, | |
206 | uint8_t *dst, size_t dst_stride, | ||
207 | size_t width, size_t height, | ||
208 | bool is_yv12, size_t begin, | ||
209 | size_t end) { | ||
210 | 95 | YUVpToRGB operation{is_yv12}; | |
211 | 285 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
212 | 95 | height, begin, end); | |
213 | 95 | } | |
214 | |||
215 | KLEIDICV_TARGET_FN_ATTRS | ||
216 | 95 | kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, | |
217 | uint8_t *dst, size_t dst_stride, | ||
218 | size_t width, size_t height, | ||
219 | bool is_yv12, size_t begin, | ||
220 | size_t end) { | ||
221 | 95 | YUVpToRGBA operation{is_yv12}; | |
222 | 285 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
223 | 95 | height, begin, end); | |
224 | 95 | } | |
225 | |||
226 | KLEIDICV_TARGET_FN_ATTRS | ||
227 | 95 | kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, | |
228 | uint8_t *dst, size_t dst_stride, | ||
229 | size_t width, size_t height, | ||
230 | bool is_yv12, size_t begin, | ||
231 | size_t end) { | ||
232 | 95 | YUVpToBGR operation{is_yv12}; | |
233 | 285 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
234 | 95 | height, begin, end); | |
235 | 95 | } | |
236 | |||
237 | KLEIDICV_TARGET_FN_ATTRS | ||
238 | 95 | kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, | |
239 | uint8_t *dst, size_t dst_stride, | ||
240 | size_t width, size_t height, | ||
241 | bool is_yv12, size_t begin, | ||
242 | size_t end) { | ||
243 | 95 | YUVpToBGRA operation{is_yv12}; | |
244 | 285 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
245 | 95 | height, begin, end); | |
246 | 95 | } | |
247 | } // namespace kleidicv::neon | ||
248 |