Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_YUV_P_TO_RGB_SC_H | ||
6 | #define KLEIDICV_YUV_P_TO_RGB_SC_H | ||
7 | |||
8 | #include <algorithm> | ||
9 | |||
10 | #include "kleidicv/conversions/yuv_420_to_rgb.h" | ||
11 | #include "kleidicv/kleidicv.h" | ||
12 | #include "kleidicv/sve2.h" | ||
13 | #include "yuv420_to_rgb_sc.h" | ||
14 | |||
15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
16 | |||
17 | template <bool BGR, bool kAlpha> | ||
18 | class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha> { | ||
19 | public: | ||
20 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb; | ||
21 | |||
22 | 932 | explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING | |
23 | 932 | : YUV420XToRGBxOrBGRx<BGR, kAlpha>(v_first) {} | |
24 | |||
25 | // Returns the number of channels in the output image. | ||
26 | 772 | static constexpr size_t output_channels() KLEIDICV_STREAMING { | |
27 | 772 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
28 | } | ||
29 | |||
30 | // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and | ||
31 | // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration. | ||
32 | 15840 | void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, | |
33 | svint16_t &v, uint8_t *rgbx_row_0, | ||
34 | uint8_t *rgbx_row_1) KLEIDICV_STREAMING { | ||
35 | 15840 | yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); | |
36 | 15840 | } | |
37 | }; // end of class YUVpToRGBxOrBGRx<bool, bool> | ||
38 | |||
39 | using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>; | ||
40 | using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>; | ||
41 | using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>; | ||
42 | using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>; | ||
43 | |||
44 | template <typename OperationType, typename ScalarType> | ||
45 | 932 | kleidicv_error_t yuv2rgbx_operation(OperationType &operation, | |
46 | const ScalarType *src, size_t src_stride, | ||
47 | ScalarType *dst, size_t dst_stride, | ||
48 | size_t width, size_t height, size_t begin, | ||
49 | size_t end) KLEIDICV_STREAMING { | ||
50 |
16/16✓ Branch 0 taken 10 times.
✓ Branch 1 taken 223 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 223 times.
✓ Branch 4 taken 10 times.
✓ Branch 5 taken 223 times.
✓ Branch 6 taken 10 times.
✓ Branch 7 taken 223 times.
✓ Branch 8 taken 10 times.
✓ Branch 9 taken 223 times.
✓ Branch 10 taken 10 times.
✓ Branch 11 taken 223 times.
✓ Branch 12 taken 10 times.
✓ Branch 13 taken 223 times.
✓ Branch 14 taken 10 times.
✓ Branch 15 taken 223 times.
|
932 | CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); |
51 |
16/16✓ Branch 0 taken 10 times.
✓ Branch 1 taken 213 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 213 times.
✓ Branch 4 taken 10 times.
✓ Branch 5 taken 213 times.
✓ Branch 6 taken 10 times.
✓ Branch 7 taken 213 times.
✓ Branch 8 taken 10 times.
✓ Branch 9 taken 213 times.
✓ Branch 10 taken 10 times.
✓ Branch 11 taken 213 times.
✓ Branch 12 taken 10 times.
✓ Branch 13 taken 213 times.
✓ Branch 14 taken 10 times.
✓ Branch 15 taken 213 times.
|
892 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
52 |
24/24✓ Branch 0 taken 10 times.
✓ Branch 1 taken 203 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 193 times.
✓ Branch 4 taken 20 times.
✓ Branch 5 taken 193 times.
✓ Branch 6 taken 10 times.
✓ Branch 7 taken 203 times.
✓ Branch 8 taken 10 times.
✓ Branch 9 taken 193 times.
✓ Branch 10 taken 20 times.
✓ Branch 11 taken 193 times.
✓ Branch 12 taken 10 times.
✓ Branch 13 taken 203 times.
✓ Branch 14 taken 10 times.
✓ Branch 15 taken 193 times.
✓ Branch 16 taken 20 times.
✓ Branch 17 taken 193 times.
✓ Branch 18 taken 10 times.
✓ Branch 19 taken 203 times.
✓ Branch 20 taken 10 times.
✓ Branch 21 taken 193 times.
✓ Branch 22 taken 20 times.
✓ Branch 23 taken 193 times.
|
852 | CHECK_IMAGE_SIZE(width, height); |
53 | |||
54 | // Pointer to the start of the U plane. | ||
55 | // Since `src` points to a planar YUV buffer, the Y plane comes first, | ||
56 | // occupying `src_stride * height` bytes. | ||
57 | 772 | const ScalarType *u = src + src_stride * height; | |
58 | // Pointer to the start of the V plane. | ||
59 | // The V plane follows the U plane. Both U and V planes are | ||
60 | // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and | ||
61 | // are often stored in a single contiguous chroma region in memory. Depending | ||
62 | // on image height and stride, the starting offset of V may require adjustment | ||
63 | // to maintain correct alignment. In particular, when the image height is not | ||
64 | // divisible evenly by 4, the chroma rows may not align perfectly, so a | ||
65 | // fractional offset (in rows) is applied to calculate the V plane position. | ||
66 | // The formula used here accounts for this by adjusting based on row parity, | ||
67 | // assuming consistent memory layout across the Y, U, and V planes. | ||
68 | 1544 | const ScalarType *v = | |
69 | 772 | u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); | |
70 | |||
71 | // These indices control how U and V row strides are selected across the image | ||
72 | // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to | ||
73 | // two luma (Y) rows. However, when the image height is not divisible by 4, | ||
74 | // the mapping between chroma and luma rows becomes asymmetric. Specifically, | ||
75 | // when `height % 4 == 2`, the start of the V plane is offset by one chroma | ||
76 | // row relative to U. | ||
77 | // | ||
78 | // This results in U and V rows being interleaved with a phase difference, | ||
79 | // which must be accounted for during row-wise traversal. To handle this, | ||
80 | // `u_index` and `v_index` are used to alternate the stride selection | ||
81 | // independently for U and V across the loop. | ||
82 | // | ||
83 | // This mechanism ensures that memory access patterns remain correct, | ||
84 | // especially in layouts where U and V share a contiguous buffer with | ||
85 | // alternating strides. Offsetting `v_index` allows the traversal logic to | ||
86 | // maintain correct alignment and prevents misaligned or incorrect reads from | ||
87 | // the chroma buffer. | ||
88 | 772 | size_t u_index = 0; | |
89 | 772 | size_t v_index = height % 4 == 2 ? 1 : 0; | |
90 | |||
91 | // Compute the actual row range in the Y plane (full resolution). | ||
92 | // Since each UV row maps to 2 Y rows, we double the begin/end indices. | ||
93 | 772 | size_t row_begin = begin * 2; | |
94 | 772 | size_t row_end = std::min<size_t>(height, end * 2); | |
95 | 772 | size_t row_uv = begin; | |
96 | |||
97 | // UV stepping pattern: first half of row, then padded second half. | ||
98 | // Needed to match row strides between chroma and luma components. | ||
99 | 772 | size_t uv_strides[2] = {width / 2, src_stride - width / 2}; | |
100 | |||
101 | // Calculate starting pointers for Y, U, and V planes at the given stripe | ||
102 | // start. | ||
103 | 772 | const ScalarType *y0 = src + row_begin * src_stride; | |
104 | 772 | u = u + row_uv * src_stride / 2; | |
105 | 772 | v = v + row_uv * src_stride / 2; | |
106 | |||
107 | 772 | size_t dcn = operation.output_channels(); | |
108 | 772 | const size_t kVectorLength = svcntb(); | |
109 |
8/8✓ Branch 0 taken 3826 times.
✓ Branch 1 taken 193 times.
✓ Branch 2 taken 3826 times.
✓ Branch 3 taken 193 times.
✓ Branch 4 taken 3826 times.
✓ Branch 5 taken 193 times.
✓ Branch 6 taken 3826 times.
✓ Branch 7 taken 193 times.
|
16076 | for (size_t h = row_begin; h < row_end; h += 2) { |
110 | 15304 | ScalarType *row0 = dst + dst_stride * h; | |
111 | 15304 | ScalarType *row1 = dst + dst_stride * (h + 1); | |
112 | 15304 | const ScalarType *y1 = y0 + src_stride; | |
113 | |||
114 | // Guard for odd-height images. | ||
115 | // If the last row in the stripe is unpaired (odd number of rows), | ||
116 | // reuse the previous row pointers to avoid out-of-bounds access. | ||
117 |
8/8✓ Branch 0 taken 3728 times.
✓ Branch 1 taken 98 times.
✓ Branch 2 taken 3728 times.
✓ Branch 3 taken 98 times.
✓ Branch 4 taken 3728 times.
✓ Branch 5 taken 98 times.
✓ Branch 6 taken 3728 times.
✓ Branch 7 taken 98 times.
|
15304 | if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { |
118 | 392 | row1 = row0; | |
119 | 392 | y1 = y0; | |
120 | 392 | } | |
121 | |||
122 | 15304 | LoopUnroll2 loop{width, svcntb()}; | |
123 | |||
124 | 15592 | loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
125 | 288 | svbool_t pg = svptrue_b8(); | |
126 | 288 | svuint8_t u8_vec = svld1(pg, u + index / 2); | |
127 | 288 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
128 | 288 | svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec)); | |
129 | |||
130 | 288 | svuint8_t v8_vec = svld1(pg, v + index / 2); | |
131 | 288 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
132 | 288 | svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec)); | |
133 | |||
134 | 288 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
135 | 288 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
136 | 288 | svuint8_t y2_vec = svld1(pg, y0 + index + kVectorLength); | |
137 | 288 | svuint8_t y3_vec = svld1(pg, y1 + index + kVectorLength); | |
138 | |||
139 | 576 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
140 | 288 | &row0[index * dcn], &row1[index * dcn]); | |
141 | |||
142 | 576 | operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi, | |
143 | 288 | &row0[(index + kVectorLength) * dcn], | |
144 | 288 | &row1[(index + kVectorLength) * dcn]); | |
145 | 288 | }); | |
146 | |||
147 | 15744 | loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { | |
148 | 440 | svbool_t pg = svptrue_b8(); | |
149 | 440 | svbool_t pg_half = svwhilelt_b8(0UL, svcntb() / 2); | |
150 | |||
151 | 440 | svuint8_t u8_vec = svld1(pg_half, u + index / 2); | |
152 | 440 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
153 | |||
154 | 440 | svuint8_t v8_vec = svld1(pg_half, v + index / 2); | |
155 | 440 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
156 | |||
157 | 440 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
158 | 440 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
159 | |||
160 | 880 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
161 | 440 | &row0[index * dcn], &row1[index * dcn]); | |
162 | 440 | }); | |
163 | |||
164 | 30128 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
165 | 14824 | svbool_t pg = svwhilelt_b8_u64(index, length); | |
166 | 14824 | svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) >> 1); | |
167 | |||
168 | 14824 | svuint8_t u8_vec = svld1(pg_half, u + index / 2); | |
169 | 14824 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
170 | |||
171 | 14824 | svuint8_t v8_vec = svld1(pg_half, v + index / 2); | |
172 | 14824 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
173 | |||
174 | 14824 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
175 | 14824 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
176 | |||
177 | 29648 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
178 | 14824 | &row0[index * dcn], &row1[index * dcn]); | |
179 | 14824 | }); | |
180 | |||
181 | 15304 | y0 += src_stride * 2; | |
182 | 15304 | u += uv_strides[(u_index++) & 1]; | |
183 | 15304 | v += uv_strides[(v_index++) & 1]; | |
184 | 15304 | } | |
185 | |||
186 | 772 | return KLEIDICV_OK; | |
187 | 932 | } | |
188 | |||
189 | KLEIDICV_TARGET_FN_ATTRS | ||
190 | 233 | static kleidicv_error_t yuv_p_to_rgb_stripe_u8_sc( | |
191 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
192 | size_t width, size_t height, bool v_first, size_t begin, | ||
193 | size_t end) KLEIDICV_STREAMING { | ||
194 | 233 | YUVpToRGB operation{v_first}; | |
195 | 699 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
196 | 233 | height, begin, end); | |
197 | 233 | } | |
198 | |||
199 | KLEIDICV_TARGET_FN_ATTRS | ||
200 | 233 | static kleidicv_error_t yuv_p_to_rgba_stripe_u8_sc( | |
201 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
202 | size_t width, size_t height, bool v_first, size_t begin, | ||
203 | size_t end) KLEIDICV_STREAMING { | ||
204 | 233 | YUVpToRGBA operation{v_first}; | |
205 | 699 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
206 | 233 | height, begin, end); | |
207 | 233 | } | |
208 | |||
209 | KLEIDICV_TARGET_FN_ATTRS | ||
210 | 233 | static kleidicv_error_t yuv_p_to_bgr_stripe_u8_sc( | |
211 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
212 | size_t width, size_t height, bool v_first, size_t begin, | ||
213 | size_t end) KLEIDICV_STREAMING { | ||
214 | 233 | YUVpToBGR operation{v_first}; | |
215 | 699 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
216 | 233 | height, begin, end); | |
217 | 233 | } | |
218 | |||
219 | KLEIDICV_TARGET_FN_ATTRS | ||
220 | 233 | static kleidicv_error_t yuv_p_to_bgra_stripe_u8_sc( | |
221 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
222 | size_t width, size_t height, bool v_first, size_t begin, | ||
223 | size_t end) KLEIDICV_STREAMING { | ||
224 | 233 | YUVpToBGRA operation{v_first}; | |
225 | 699 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
226 | 233 | height, begin, end); | |
227 | 233 | } | |
228 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
229 | |||
230 | #endif // KLEIDICV_YUV_P_TO_RGB_SC_H | ||
231 |