| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_YUV420P_TO_RGB_SC_H | ||
| 6 | #define KLEIDICV_YUV420P_TO_RGB_SC_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | |||
| 10 | #include "kleidicv/conversions/yuv_to_rgb.h" | ||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/sve2.h" | ||
| 13 | #include "yuv420_to_rgb_sc.h" | ||
| 14 | |||
| 15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 16 | |||
// Planar YUV 4:2:0 converter built on YUV420XToRGBxOrBGRx<BGR, kAlpha>.
// The v_first constructor argument is forwarded unchanged to the base class.
| 17 | template <bool BGR, bool kAlpha> | ||
| 18 | class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha> { | ||
| 19 | public: | ||
| 20 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb; | ||
| 21 | |||
| 22 | 2352 | explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING | |
| 23 | 2352 | : YUV420XToRGBxOrBGRx<BGR, kAlpha>(v_first) {} | |
| 24 | |||
| 25 | // Returns the number of channels in the output image: 4 when kAlpha is set, otherwise 3. | ||
| 26 | 2192 | static constexpr size_t output_channels() KLEIDICV_STREAMING { | |
| 27 | 2192 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
| 28 | } | ||
| 29 | |||
| 30 | // Forwards one vector of Y values from each of the even and odd rows, plus the U and V vectors, to yuv420x_to_rgb(). | ||
| 31 | // At a 128-bit vector length that is 2 * 16 bytes of Y in and 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) out per call. NOTE(review): presumably scales with svcntb() on wider vectors - confirm. | ||
| 32 | KLEIDICV_FORCE_INLINE | ||
| 33 | 40032 | void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, | |
| 34 | svint16_t &v, uint8_t *rgbx_row_0, | ||
| 35 | uint8_t *rgbx_row_1) const KLEIDICV_STREAMING { | ||
| 36 | 40032 | yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); | |
| 37 | 40032 | } | |
| 38 | }; // end of class YUVpToRGBxOrBGRx<bool BGR, bool kAlpha> | ||
| 39 | |||
// Concrete converters; the template arguments are <BGR, kAlpha>.
| 40 | using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>; | ||
| 41 | using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>; | ||
| 42 | using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>; | ||
| 43 | using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>; | ||
| 44 | |||
// Converts one stripe of a planar YUV 4:2:0 image into an output image with
// operation.output_channels() channels per pixel.
//
// operation  - converter supplying output_channels() and vector_path().
// src        - start of the Y plane; the chroma data is addressed from
//              src + src_stride * height onwards.
// dst        - output image whose rows are dst_stride elements apart.
// begin, end - stripe bounds in units of two luma rows: rows begin * 2 up to
//              (not including) min(height, end * 2) are converted.
//
// Returns KLEIDICV_OK once the stripe has been converted.
// NOTE(review): the CHECK_* macros are defined elsewhere; presumably they
// return an error code early when an argument is invalid - confirm.
| 45 | template <typename OperationType, typename ScalarType> | ||
| 46 | 2352 | kleidicv_error_t yuv2rgbx_operation(OperationType &operation, | |
| 47 | const ScalarType *src, size_t src_stride, | ||
| 48 | ScalarType *dst, size_t dst_stride, | ||
| 49 | size_t width, size_t height, size_t begin, | ||
| 50 | size_t end) KLEIDICV_STREAMING { | ||
| 51 |
16/16✓ Branch 0 taken 6 times.
✓ Branch 1 taken 582 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 582 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 582 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 582 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 582 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 582 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 582 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 582 times.
|
2352 | CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); |
| 52 |
16/16✓ Branch 0 taken 6 times.
✓ Branch 1 taken 576 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 576 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 576 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 576 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 576 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 576 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 576 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 576 times.
|
2328 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 53 |
24/24✓ Branch 0 taken 14 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 548 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 548 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 562 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 548 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 548 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 562 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 548 times.
✓ Branch 16 taken 28 times.
✓ Branch 17 taken 548 times.
✓ Branch 18 taken 14 times.
✓ Branch 19 taken 562 times.
✓ Branch 20 taken 14 times.
✓ Branch 21 taken 548 times.
✓ Branch 22 taken 28 times.
✓ Branch 23 taken 548 times.
|
2304 | CHECK_IMAGE_SIZE(width, height); |
| 54 | |||
| 55 | // Pointer to the start of the U plane. | ||
| 56 | // Since `src` points to a planar YUV buffer, the Y plane comes first, | ||
| 57 | // occupying `src_stride * height` bytes. | ||
| 58 | 2192 | const ScalarType *u = src + src_stride * height; | |
| 59 | // Pointer to the start of the V plane. | ||
| 60 | // The V plane follows the U plane. Both U and V planes are | ||
| 61 | // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and | ||
| 62 | // are often stored in a single contiguous chroma region in memory. Depending | ||
| 63 | // on image height and stride, the starting offset of V may require adjustment | ||
| 64 | // to maintain correct alignment. In particular, when the image height is not | ||
| 65 | // divisible evenly by 4, the chroma rows may not align perfectly, so a | ||
| 66 | // fractional offset (in rows) is applied to calculate the V plane position. | ||
| 67 | // The formula used here accounts for this by adjusting based on row parity, | ||
| 68 | // assuming consistent memory layout across the Y, U, and V planes. | ||
| 69 | 4384 | const ScalarType *v = | |
| 70 | 2192 | u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); | |
| 71 | |||
| 72 | // These indices control how U and V row strides are selected across the image | ||
| 73 | // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to | ||
| 74 | // two luma (Y) rows. However, when the image height is not divisible by 4, | ||
| 75 | // the mapping between chroma and luma rows becomes asymmetric. Specifically, | ||
| 76 | // when `height % 4 == 2`, the start of the V plane is offset by one chroma | ||
| 77 | // row relative to U. | ||
| 78 | // | ||
| 79 | // This results in U and V rows being interleaved with a phase difference, | ||
| 80 | // which must be accounted for during row-wise traversal. To handle this, | ||
| 81 | // `u_index` and `v_index` are used to alternate the stride selection | ||
| 82 | // independently for U and V across the loop. | ||
| 83 | // | ||
| 84 | // This mechanism ensures that memory access patterns remain correct, | ||
| 85 | // especially in layouts where U and V share a contiguous buffer with | ||
| 86 | // alternating strides. Offsetting `v_index` allows the traversal logic to | ||
| 87 | // maintain correct alignment and prevents misaligned or incorrect reads from | ||
| 88 | // the chroma buffer. | ||
| 89 | 2192 | size_t u_index = 0; | |
| 90 | 2192 | size_t v_index = height % 4 == 2 ? 1 : 0; | |
| 91 | |||
| 92 | // Compute the actual row range in the Y plane (full resolution). | ||
| 93 | // Since each UV row maps to 2 Y rows, we double the begin/end indices. | ||
| 94 | 2192 | size_t row_begin = begin * 2; | |
| 95 | 2192 | size_t row_end = std::min<size_t>(height, end * 2); | |
| 96 | 2192 | size_t row_uv = begin; | |
| 97 | |||
| 98 | // UV stepping pattern: width / 2 (first half of a row), then src_stride - width / 2 (padded second half). | ||
| 99 | // The two steps add up to src_stride, which keeps the chroma pointers in step with the luma row stride. | ||
| 100 | 2192 | size_t uv_strides[2] = {width / 2, src_stride - width / 2}; | |
| 101 | |||
| 102 | // Calculate starting pointers for Y, U, and V planes at the given stripe | ||
| 103 | // start. | ||
| 104 | 2192 | const ScalarType *y0 = src + row_begin * src_stride; | |
| 105 | 2192 | u = u + (row_uv / 2) * src_stride; | |
| 106 | 2192 | v = v + (row_uv / 2) * src_stride; | |
| 107 | |||
| 108 |
8/8✓ Branch 0 taken 428 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 428 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 428 times.
✓ Branch 5 taken 120 times.
✓ Branch 6 taken 428 times.
✓ Branch 7 taken 120 times.
|
2192 | if (row_uv % 2 == 1) { |
| 109 | 480 | u += uv_strides[(u_index++) & 1]; | |
| 110 | 480 | v += uv_strides[(v_index++) & 1]; | |
| 111 | 480 | } | |
| 112 | |||
| 113 | 2192 | size_t dcn = operation.output_channels(); | |
| 114 | 2192 | const size_t kVectorLength = svcntb(); | |
| 115 |
8/8✓ Branch 0 taken 9770 times.
✓ Branch 1 taken 548 times.
✓ Branch 2 taken 9770 times.
✓ Branch 3 taken 548 times.
✓ Branch 4 taken 9770 times.
✓ Branch 5 taken 548 times.
✓ Branch 6 taken 9770 times.
✓ Branch 7 taken 548 times.
|
41272 | for (size_t h = row_begin; h < row_end; h += 2) { |
| 116 | 39080 | ScalarType *row0 = dst + dst_stride * h; | |
| 117 | 39080 | ScalarType *row1 = dst + dst_stride * (h + 1); | |
| 118 | 39080 | const ScalarType *y1 = y0 + src_stride; | |
| 119 | |||
| 120 | // Guard for an unpaired final row. | ||
| 121 | // If h is the last row of the stripe (h == row_end - 1) it has no partner row, | ||
| 122 | // so row1 and y1 are pointed at the same row as row0 and y0; no row at or beyond row_end is addressed. | ||
| 123 |
8/8✓ Branch 0 taken 9568 times.
✓ Branch 1 taken 202 times.
✓ Branch 2 taken 9568 times.
✓ Branch 3 taken 202 times.
✓ Branch 4 taken 9568 times.
✓ Branch 5 taken 202 times.
✓ Branch 6 taken 9568 times.
✓ Branch 7 taken 202 times.
|
39080 | if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { |
| 124 | 808 | row1 = row0; | |
| 125 | 808 | y1 = y0; | |
| 126 | 808 | } | |
| 127 | |||
| 128 | 39080 | LoopUnroll2 loop{width, svcntb()}; | |
| 129 | |||
| 130 | struct VectorPath2x { | ||
| 131 | const ScalarType *y0, *y1, *u, *v; | ||
| 132 | ScalarType *row0, *row1; | ||
| 133 | const size_t kVectorLength, dcn; | ||
| 134 | OperationType operation; | ||
| 135 | KLEIDICV_FORCE_INLINE | ||
| 136 | 432 | void operator()(size_t index) const KLEIDICV_STREAMING { | |
| 137 | 432 | svbool_t pg = svptrue_b8(); | |
| 138 | 432 | svuint8_t u8_vec = svld1(pg, u + index / 2); | |
| 139 | 432 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
| 140 | 432 | svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec)); | |
| 141 | |||
| 142 | 432 | svuint8_t v8_vec = svld1(pg, v + index / 2); | |
| 143 | 432 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
| 144 | 432 | svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec)); | |
| 145 | |||
| 146 | #if KLEIDICV_TARGET_SME2 | ||
| 147 | // All lanes are active in this path, so an all-true predicate-as-counter is used for the paired loads. | ||
| 148 | 144 | svcount_t pg_counter = svptrue_c8(); | |
| 149 | 144 | svuint8x2_t y_even = svld1_x2(pg_counter, y0 + index); | |
| 150 | 144 | svuint8x2_t y_odd = svld1_x2(pg_counter, y1 + index); | |
| 151 | 144 | svuint8_t y0_vec = svget2(y_even, 0); | |
| 152 | 144 | svuint8_t y1_vec = svget2(y_odd, 0); | |
| 153 | 144 | svuint8_t y2_vec = svget2(y_even, 1); | |
| 154 | 144 | svuint8_t y3_vec = svget2(y_odd, 1); | |
| 155 | #else | ||
| 156 | 288 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
| 157 | 288 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
| 158 | 288 | svuint8_t y2_vec = svld1(pg, y0 + index + kVectorLength); | |
| 159 | 288 | svuint8_t y3_vec = svld1(pg, y1 + index + kVectorLength); | |
| 160 | #endif // KLEIDICV_TARGET_SME2 | ||
| 161 | |||
| 162 | 864 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
| 163 | 432 | &row0[index * dcn], &row1[index * dcn]); | |
| 164 | |||
| 165 | 864 | operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi, | |
| 166 | 432 | &row0[(index + kVectorLength) * dcn], | |
| 167 | 432 | &row1[(index + kVectorLength) * dcn]); | |
| 168 | 432 | } | |
| 169 | }; | ||
| 170 | 39080 | loop.unroll_twice( | |
| 171 | 39080 | VectorPath2x{y0, y1, u, v, row0, row1, kVectorLength, dcn, operation}); | |
| 172 | |||
| 173 | struct VectorPath1x { | ||
| 174 | const ScalarType *y0, *y1, *u, *v; | ||
| 175 | ScalarType *row0, *row1; | ||
| 176 | const size_t dcn; | ||
| 177 | OperationType operation; | ||
| 178 | KLEIDICV_FORCE_INLINE | ||
| 179 | 448 | void operator()(size_t index) const KLEIDICV_STREAMING { | |
| 180 | 448 | svbool_t pg = svptrue_b8(); | |
| 181 | 448 | svbool_t pg_half = svwhilelt_b8(0UL, svcntb() / 2); | |
| 182 | |||
| 183 | 448 | svuint8_t u8_vec = svld1(pg_half, u + index / 2); | |
| 184 | 448 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
| 185 | |||
| 186 | 448 | svuint8_t v8_vec = svld1(pg_half, v + index / 2); | |
| 187 | 448 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
| 188 | |||
| 189 | 448 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
| 190 | 448 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
| 191 | |||
| 192 | 896 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
| 193 | 448 | &row0[index * dcn], &row1[index * dcn]); | |
| 194 | 448 | } | |
| 195 | }; | ||
| 196 | 39080 | loop.unroll_once(VectorPath1x{y0, y1, u, v, row0, row1, dcn, operation}); | |
| 197 | |||
| 198 | struct RemainingPath { | ||
| 199 | const ScalarType *y0, *y1, *u, *v; | ||
| 200 | ScalarType *row0, *row1; | ||
| 201 | const size_t dcn; | ||
| 202 | OperationType operation; | ||
| 203 | KLEIDICV_FORCE_INLINE | ||
| 204 | 38720 | void operator()(size_t index, size_t length) const KLEIDICV_STREAMING { | |
| 205 | 38720 | svbool_t pg = svwhilelt_b8_u64(index, length); | |
| 206 | 38720 | svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) >> 1); | |
| 207 | |||
| 208 | 38720 | svuint8_t u8_vec = svld1(pg_half, u + index / 2); | |
| 209 | 38720 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
| 210 | |||
| 211 | 38720 | svuint8_t v8_vec = svld1(pg_half, v + index / 2); | |
| 212 | 38720 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
| 213 | |||
| 214 | 38720 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
| 215 | 38720 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
| 216 | |||
| 217 | 77440 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
| 218 | 38720 | &row0[index * dcn], &row1[index * dcn]); | |
| 219 | 38720 | } | |
| 220 | }; | ||
| 221 | 39080 | loop.remaining(RemainingPath{y0, y1, u, v, row0, row1, dcn, operation}); | |
| 222 | |||
| 223 | 39080 | y0 += src_stride * 2; | |
| 224 | 39080 | u += uv_strides[(u_index++) & 1]; | |
| 225 | 39080 | v += uv_strides[(v_index++) & 1]; | |
| 226 | 39080 | } | |
| 227 | |||
| 228 | 2192 | return KLEIDICV_OK; | |
| 229 | 2352 | } | |
| 230 | |||
// Picks the converter that matches color_format and runs yuv2rgbx_operation()
// on the stripe [begin, end) with it.
//
// The KLEIDICV_YV12_* formats construct the converter with v_first = true and
// the KLEIDICV_IYUV_* formats with v_first = false; the BGR/RGB and alpha
// variants select the corresponding YUVpTo* alias.
//
// Returns the result of yuv2rgbx_operation(), or
// KLEIDICV_ERROR_NOT_IMPLEMENTED for any other color_format. Every switch
// path (including default) returns, so the return after the switch is not
// reached.
| 231 | KLEIDICV_TARGET_FN_ATTRS | ||
| 232 | 2476 | static kleidicv_error_t yuv420p_to_rgb_stripe_u8_sc( | |
| 233 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 234 | size_t width, size_t height, kleidicv_color_conversion_t color_format, | ||
| 235 | size_t begin, size_t end) KLEIDICV_STREAMING { | ||
| 236 |
9/9✓ Branch 0 taken 294 times.
✓ Branch 1 taken 294 times.
✓ Branch 2 taken 294 times.
✓ Branch 3 taken 294 times.
✓ Branch 4 taken 294 times.
✓ Branch 5 taken 294 times.
✓ Branch 6 taken 294 times.
✓ Branch 7 taken 124 times.
✓ Branch 8 taken 294 times.
|
2476 | switch (color_format) { |
| 237 | case KLEIDICV_YV12_TO_BGR: { | ||
| 238 | 294 | YUVpToBGR operation{true}; | |
| 239 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 240 | 294 | width, height, begin, end); | |
| 241 | 294 | } | |
| 242 | |||
| 243 | case KLEIDICV_YV12_TO_RGB: { | ||
| 244 | 294 | YUVpToRGB operation{true}; | |
| 245 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 246 | 294 | width, height, begin, end); | |
| 247 | 294 | } | |
| 248 | |||
| 249 | case KLEIDICV_YV12_TO_BGRA: { | ||
| 250 | 294 | YUVpToBGRA operation{true}; | |
| 251 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 252 | 294 | width, height, begin, end); | |
| 253 | 294 | } | |
| 254 | |||
| 255 | case KLEIDICV_YV12_TO_RGBA: { | ||
| 256 | 294 | YUVpToRGBA operation{true}; | |
| 257 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 258 | 294 | width, height, begin, end); | |
| 259 | 294 | } | |
| 260 | |||
| 261 | case KLEIDICV_IYUV_TO_BGR: { | ||
| 262 | 294 | YUVpToBGR operation{false}; | |
| 263 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 264 | 294 | width, height, begin, end); | |
| 265 | 294 | } | |
| 266 | |||
| 267 | case KLEIDICV_IYUV_TO_RGB: { | ||
| 268 | 294 | YUVpToRGB operation{false}; | |
| 269 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 270 | 294 | width, height, begin, end); | |
| 271 | 294 | } | |
| 272 | |||
| 273 | case KLEIDICV_IYUV_TO_BGRA: { | ||
| 274 | 294 | YUVpToBGRA operation{false}; | |
| 275 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 276 | 294 | width, height, begin, end); | |
| 277 | 294 | } | |
| 278 | |||
| 279 | case KLEIDICV_IYUV_TO_RGBA: { | ||
| 280 | 294 | YUVpToRGBA operation{false}; | |
| 281 | 588 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 282 | 294 | width, height, begin, end); | |
| 283 | 294 | } | |
| 284 | |||
| 285 | default: | ||
| 286 | 124 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 287 | } | ||
| 288 | |||
| 289 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | ||
| 290 | 2476 | } | |
| 291 | |||
| 292 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 293 | |||
| 294 | #endif // KLEIDICV_YUV420P_TO_RGB_SC_H | ||
| 295 |