| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_YUV_P_TO_RGB_SC_H | ||
| 6 | #define KLEIDICV_YUV_P_TO_RGB_SC_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | |||
| 10 | #include "kleidicv/conversions/yuv_420_to_rgb.h" | ||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/sve2.h" | ||
| 13 | #include "yuv420_to_rgb_sc.h" | ||
| 14 | |||
| 15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 16 | |||
| 17 | template <bool BGR, bool kAlpha> | ||
| 18 | class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha> { | ||
| 19 | public: | ||
| 20 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb; | ||
| 21 | |||
| 22 | 1312 | explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING | |
| 23 | 1312 | : YUV420XToRGBxOrBGRx<BGR, kAlpha>(v_first) {} | |
| 24 | |||
| 25 | // Returns the number of channels in the output image. | ||
| 26 | 1088 | static constexpr size_t output_channels() KLEIDICV_STREAMING { | |
| 27 | 1088 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
| 28 | } | ||
| 29 | |||
| 30 | // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and | ||
| 31 | // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration. | ||
| 32 | 21600 | void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, | |
| 33 | svint16_t &v, uint8_t *rgbx_row_0, | ||
| 34 | uint8_t *rgbx_row_1) KLEIDICV_STREAMING { | ||
| 35 | 21600 | yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); | |
| 36 | 21600 | } | |
| 37 | }; // end of class YUVpToRGBxOrBGRx<bool, bool> | ||
| 38 | |||
| 39 | using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>; | ||
| 40 | using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>; | ||
| 41 | using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>; | ||
| 42 | using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>; | ||
| 43 | |||
| 44 | template <typename OperationType, typename ScalarType> | ||
| 45 | 1312 | kleidicv_error_t yuv2rgbx_operation(OperationType &operation, | |
| 46 | const ScalarType *src, size_t src_stride, | ||
| 47 | ScalarType *dst, size_t dst_stride, | ||
| 48 | size_t width, size_t height, size_t begin, | ||
| 49 | size_t end) KLEIDICV_STREAMING { | ||
| 50 |
16/16✓ Branch 0 taken 14 times.
✓ Branch 1 taken 314 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 314 times.
✓ Branch 4 taken 14 times.
✓ Branch 5 taken 314 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 314 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 314 times.
✓ Branch 10 taken 14 times.
✓ Branch 11 taken 314 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 314 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 314 times.
|
1312 | CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); |
| 51 |
16/16✓ Branch 0 taken 14 times.
✓ Branch 1 taken 300 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 300 times.
✓ Branch 4 taken 14 times.
✓ Branch 5 taken 300 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 300 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 300 times.
✓ Branch 10 taken 14 times.
✓ Branch 11 taken 300 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 300 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 300 times.
|
1256 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 52 |
24/24✓ Branch 0 taken 14 times.
✓ Branch 1 taken 286 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 272 times.
✓ Branch 6 taken 14 times.
✓ Branch 7 taken 286 times.
✓ Branch 8 taken 14 times.
✓ Branch 9 taken 272 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 272 times.
✓ Branch 12 taken 14 times.
✓ Branch 13 taken 286 times.
✓ Branch 14 taken 14 times.
✓ Branch 15 taken 272 times.
✓ Branch 16 taken 28 times.
✓ Branch 17 taken 272 times.
✓ Branch 18 taken 14 times.
✓ Branch 19 taken 286 times.
✓ Branch 20 taken 14 times.
✓ Branch 21 taken 272 times.
✓ Branch 22 taken 28 times.
✓ Branch 23 taken 272 times.
|
1200 | CHECK_IMAGE_SIZE(width, height); |
| 53 | |||
| 54 | // Pointer to the start of the U plane. | ||
| 55 | // Since `src` points to a planar YUV buffer, the Y plane comes first, | ||
| 56 | // occupying `src_stride * height` bytes. | ||
| 57 | 1088 | const ScalarType *u = src + src_stride * height; | |
| 58 | // Pointer to the start of the V plane. | ||
| 59 | // The V plane follows the U plane. Both U and V planes are | ||
| 60 | // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and | ||
| 61 | // are often stored in a single contiguous chroma region in memory. Depending | ||
| 62 | // on image height and stride, the starting offset of V may require adjustment | ||
| 63 | // to maintain correct alignment. In particular, when the image height is not | ||
| 64 | // divisible evenly by 4, the chroma rows may not align perfectly, so a | ||
| 65 | // fractional offset (in rows) is applied to calculate the V plane position. | ||
| 66 | // The formula used here accounts for this by adjusting based on row parity, | ||
| 67 | // assuming consistent memory layout across the Y, U, and V planes. | ||
| 68 | 2176 | const ScalarType *v = | |
| 69 | 1088 | u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); | |
| 70 | |||
| 71 | // These indices control how U and V row strides are selected across the image | ||
| 72 | // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to | ||
| 73 | // two luma (Y) rows. However, when the image height is not divisible by 4, | ||
| 74 | // the mapping between chroma and luma rows becomes asymmetric. Specifically, | ||
| 75 | // when `height % 4 == 2`, the start of the V plane is offset by one chroma | ||
| 76 | // row relative to U. | ||
| 77 | // | ||
| 78 | // This results in U and V rows being interleaved with a phase difference, | ||
| 79 | // which must be accounted for during row-wise traversal. To handle this, | ||
| 80 | // `u_index` and `v_index` are used to alternate the stride selection | ||
| 81 | // independently for U and V across the loop. | ||
| 82 | // | ||
| 83 | // This mechanism ensures that memory access patterns remain correct, | ||
| 84 | // especially in layouts where U and V share a contiguous buffer with | ||
| 85 | // alternating strides. Offsetting `v_index` allows the traversal logic to | ||
| 86 | // maintain correct alignment and prevents misaligned or incorrect reads from | ||
| 87 | // the chroma buffer. | ||
| 88 | 1088 | size_t u_index = 0; | |
| 89 | 1088 | size_t v_index = height % 4 == 2 ? 1 : 0; | |
| 90 | |||
| 91 | // Compute the actual row range in the Y plane (full resolution). | ||
| 92 | // Since each UV row maps to 2 Y rows, we double the begin/end indices. | ||
| 93 | 1088 | size_t row_begin = begin * 2; | |
| 94 | 1088 | size_t row_end = std::min<size_t>(height, end * 2); | |
| 95 | 1088 | size_t row_uv = begin; | |
| 96 | |||
| 97 | // UV stepping pattern: first half of row, then padded second half. | ||
| 98 | // Needed to match row strides between chroma and luma components. | ||
| 99 | 1088 | size_t uv_strides[2] = {width / 2, src_stride - width / 2}; | |
| 100 | |||
| 101 | // Calculate starting pointers for Y, U, and V planes at the given stripe | ||
| 102 | // start. | ||
| 103 | 1088 | const ScalarType *y0 = src + row_begin * src_stride; | |
| 104 | 1088 | u = u + row_uv * src_stride / 2; | |
| 105 | 1088 | v = v + row_uv * src_stride / 2; | |
| 106 | |||
| 107 | 1088 | size_t dcn = operation.output_channels(); | |
| 108 | 1088 | const size_t kVectorLength = svcntb(); | |
| 109 |
8/8✓ Branch 0 taken 5162 times.
✓ Branch 1 taken 272 times.
✓ Branch 2 taken 5162 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 5162 times.
✓ Branch 5 taken 272 times.
✓ Branch 6 taken 5162 times.
✓ Branch 7 taken 272 times.
|
21736 | for (size_t h = row_begin; h < row_end; h += 2) { |
| 110 | 20648 | ScalarType *row0 = dst + dst_stride * h; | |
| 111 | 20648 | ScalarType *row1 = dst + dst_stride * (h + 1); | |
| 112 | 20648 | const ScalarType *y1 = y0 + src_stride; | |
| 113 | |||
| 114 | // Guard for odd-height images. | ||
| 115 | // If the last row in the stripe is unpaired (odd number of rows), | ||
| 116 | // reuse the previous row pointers to avoid out-of-bounds access. | ||
| 117 |
8/8✓ Branch 0 taken 5024 times.
✓ Branch 1 taken 138 times.
✓ Branch 2 taken 5024 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 5024 times.
✓ Branch 5 taken 138 times.
✓ Branch 6 taken 5024 times.
✓ Branch 7 taken 138 times.
|
20648 | if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { |
| 118 | 552 | row1 = row0; | |
| 119 | 552 | y1 = y0; | |
| 120 | 552 | } | |
| 121 | |||
| 122 | 20648 | LoopUnroll2 loop{width, svcntb()}; | |
| 123 | |||
| 124 | 21080 | loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
| 125 | 432 | svbool_t pg = svptrue_b8(); | |
| 126 | 432 | svuint8_t u8_vec = svld1(pg, u + index / 2); | |
| 127 | 432 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
| 128 | 432 | svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec)); | |
| 129 | |||
| 130 | 432 | svuint8_t v8_vec = svld1(pg, v + index / 2); | |
| 131 | 432 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
| 132 | 432 | svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec)); | |
| 133 | |||
| 134 | #if KLEIDICV_TARGET_SME2 | ||
| 135 | // assume the predicate is full true | ||
| 136 | 144 | svcount_t pg_counter = svptrue_c8(); | |
| 137 | 144 | svuint8x2_t y_even = svld1_x2(pg_counter, y0 + index); | |
| 138 | 144 | svuint8x2_t y_odd = svld1_x2(pg_counter, y1 + index); | |
| 139 | 144 | svuint8_t y0_vec = svget2(y_even, 0); | |
| 140 | 144 | svuint8_t y1_vec = svget2(y_odd, 0); | |
| 141 | 144 | svuint8_t y2_vec = svget2(y_even, 1); | |
| 142 | 144 | svuint8_t y3_vec = svget2(y_odd, 1); | |
| 143 | #else | ||
| 144 | 288 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
| 145 | 288 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
| 146 | 288 | svuint8_t y2_vec = svld1(pg, y0 + index + kVectorLength); | |
| 147 | 288 | svuint8_t y3_vec = svld1(pg, y1 + index + kVectorLength); | |
| 148 | #endif // KLEIDICV_TARGET_SME2 | ||
| 149 | |||
| 150 | 864 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
| 151 | 432 | &row0[index * dcn], &row1[index * dcn]); | |
| 152 | |||
| 153 | 864 | operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi, | |
| 154 | 432 | &row0[(index + kVectorLength) * dcn], | |
| 155 | 432 | &row1[(index + kVectorLength) * dcn]); | |
| 156 | 432 | }); | |
| 157 | |||
| 158 | 20944 | loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { | |
| 159 | 296 | svbool_t pg = svptrue_b8(); | |
| 160 | 296 | svbool_t pg_half = svwhilelt_b8(0UL, svcntb() / 2); | |
| 161 | |||
| 162 | 296 | svuint8_t u8_vec = svld1(pg_half, u + index / 2); | |
| 163 | 296 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
| 164 | |||
| 165 | 296 | svuint8_t v8_vec = svld1(pg_half, v + index / 2); | |
| 166 | 296 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
| 167 | |||
| 168 | 296 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
| 169 | 296 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
| 170 | |||
| 171 | 592 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
| 172 | 296 | &row0[index * dcn], &row1[index * dcn]); | |
| 173 | 296 | }); | |
| 174 | |||
| 175 | 41088 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
| 176 | 20440 | svbool_t pg = svwhilelt_b8_u64(index, length); | |
| 177 | 20440 | svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) >> 1); | |
| 178 | |||
| 179 | 20440 | svuint8_t u8_vec = svld1(pg_half, u + index / 2); | |
| 180 | 20440 | svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); | |
| 181 | |||
| 182 | 20440 | svuint8_t v8_vec = svld1(pg_half, v + index / 2); | |
| 183 | 20440 | svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); | |
| 184 | |||
| 185 | 20440 | svuint8_t y0_vec = svld1(pg, y0 + index); | |
| 186 | 20440 | svuint8_t y1_vec = svld1(pg, y1 + index); | |
| 187 | |||
| 188 | 40880 | operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, | |
| 189 | 20440 | &row0[index * dcn], &row1[index * dcn]); | |
| 190 | 20440 | }); | |
| 191 | |||
| 192 | 20648 | y0 += src_stride * 2; | |
| 193 | 20648 | u += uv_strides[(u_index++) & 1]; | |
| 194 | 20648 | v += uv_strides[(v_index++) & 1]; | |
| 195 | 20648 | } | |
| 196 | |||
| 197 | 1088 | return KLEIDICV_OK; | |
| 198 | 1312 | } | |
| 199 | |||
| 200 | KLEIDICV_TARGET_FN_ATTRS | ||
| 201 | 328 | static kleidicv_error_t yuv_p_to_rgb_stripe_u8_sc( | |
| 202 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 203 | size_t width, size_t height, bool v_first, size_t begin, | ||
| 204 | size_t end) KLEIDICV_STREAMING { | ||
| 205 | 328 | YUVpToRGB operation{v_first}; | |
| 206 | 984 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 207 | 328 | height, begin, end); | |
| 208 | 328 | } | |
| 209 | |||
| 210 | KLEIDICV_TARGET_FN_ATTRS | ||
| 211 | 328 | static kleidicv_error_t yuv_p_to_rgba_stripe_u8_sc( | |
| 212 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 213 | size_t width, size_t height, bool v_first, size_t begin, | ||
| 214 | size_t end) KLEIDICV_STREAMING { | ||
| 215 | 328 | YUVpToRGBA operation{v_first}; | |
| 216 | 984 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 217 | 328 | height, begin, end); | |
| 218 | 328 | } | |
| 219 | |||
| 220 | KLEIDICV_TARGET_FN_ATTRS | ||
| 221 | 328 | static kleidicv_error_t yuv_p_to_bgr_stripe_u8_sc( | |
| 222 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 223 | size_t width, size_t height, bool v_first, size_t begin, | ||
| 224 | size_t end) KLEIDICV_STREAMING { | ||
| 225 | 328 | YUVpToBGR operation{v_first}; | |
| 226 | 984 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 227 | 328 | height, begin, end); | |
| 228 | 328 | } | |
| 229 | |||
| 230 | KLEIDICV_TARGET_FN_ATTRS | ||
| 231 | 328 | static kleidicv_error_t yuv_p_to_bgra_stripe_u8_sc( | |
| 232 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 233 | size_t width, size_t height, bool v_first, size_t begin, | ||
| 234 | size_t end) KLEIDICV_STREAMING { | ||
| 235 | 328 | YUVpToBGRA operation{v_first}; | |
| 236 | 984 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 237 | 328 | height, begin, end); | |
| 238 | 328 | } | |
| 239 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 240 | |||
| 241 | #endif // KLEIDICV_YUV_P_TO_RGB_SC_H | ||
| 242 |