| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <utility> | ||
| 6 | |||
| 7 | #include "kleidicv/conversions/yuv_to_rgb.h" | ||
| 8 | #include "kleidicv/kleidicv.h" | ||
| 9 | #include "kleidicv/neon.h" | ||
| 10 | #include "yuv420_to_rgb_neon.h" | ||
| 11 | |||
| 12 | namespace kleidicv::neon { | ||
| 13 | template <bool BGR, bool kAlpha> | ||
| 14 | class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha>, | ||
| 15 | public UnrollOnce, | ||
| 16 | public TryToAvoidTailLoop { | ||
| 17 | public: | ||
| 18 | using VecTraits = neon::VecTraits<uint8_t>; | ||
| 19 | using ScalarType = VecTraits::ScalarType; | ||
| 20 | using VectorType = VecTraits::VectorType; | ||
| 21 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::de_interleave_indices_; | ||
| 22 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb; | ||
| 23 | using YUV420XToRGBxOrBGRx<BGR, kAlpha>::v_first_; | ||
| 24 | |||
| 25 | 640 | explicit YUVpToRGBxOrBGRx(bool is_yv12) | |
| 26 | 640 | : YUV420XToRGBxOrBGRx<BGR, kAlpha>(is_yv12) {} | |
| 27 | |||
| 28 | KLEIDICV_FORCE_INLINE | ||
| 29 | 144 | void vector_path(VectorType &y0, VectorType &y1, VectorType &y2, | |
| 30 | VectorType &y3, VectorType &u, VectorType &v, | ||
| 31 | ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { | ||
| 32 | // Table-lookup indices that widen bytes to 32-bit lanes: each lane gets one | ||
| 33 | // source byte followed by three zero bytes (vqtbl1q_u8 yields 0 for 0xff). | ||
| 34 | 144 | uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, | |
| 35 | 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff}; | ||
| 36 | |||
| 37 | 144 | uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff, | |
| 38 | 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff}; | ||
| 39 | |||
| 40 | 144 | uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, | |
| 41 | 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff}; | ||
| 42 | |||
| 43 | 144 | uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff, | |
| 44 | 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff}; | ||
| 45 | |||
| 46 | // Expand each 8-bit channel into 32-bit vectors using table lookup and | ||
| 47 | // reinterpret | ||
| 48 | 144 | int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo)); | |
| 49 | 144 | int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi)); | |
| 50 | 144 | int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo)); | |
| 51 | 144 | int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi)); | |
| 52 | |||
| 53 | 144 | int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo)); | |
| 54 | 144 | int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi)); | |
| 55 | 144 | int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo)); | |
| 56 | 144 | int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi)); | |
| 57 | |||
| 58 | 144 | constexpr size_t step = kAlpha ? 4 * 16 : 3 * 16; | |
| 59 | |||
| 60 | 288 | yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0, | |
| 61 | 144 | rgbx_row_1); | |
| 62 | |||
| 63 | 288 | yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi, | |
| 64 | 144 | rgbx_row_0 + step, rgbx_row_1 + step); | |
| 65 | 144 | } | |
| 66 | |||
| 67 | // Processes inputs which are not long enough to fit a vector. | ||
| 68 | 9936 | void scalar_path(size_t length, const ScalarType *y_row_0, | |
| 69 | const ScalarType *y_row_1, const ScalarType *u_row, | ||
| 70 | const ScalarType *v_row, ScalarType *rgbx_row_0, | ||
| 71 | ScalarType *rgbx_row_1) { | ||
| 72 | 9936 | const uint8_t *y_rows[2] = {y_row_0, y_row_1}; | |
| 73 | 9936 | uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1}; | |
| 74 | |||
| 75 | 9936 | int32_t u_m128 = 0, v_m128 = 0; | |
| 76 |
8/8✓ Branch 0 taken 2484 times.
✓ Branch 1 taken 8248 times.
✓ Branch 2 taken 2484 times.
✓ Branch 3 taken 8248 times.
✓ Branch 4 taken 2484 times.
✓ Branch 5 taken 8248 times.
✓ Branch 6 taken 2484 times.
✓ Branch 7 taken 8248 times.
|
42928 | for (size_t index = 0; index < length; ++index) { |
| 77 | 32992 | disable_loop_vectorization(); | |
| 78 | |||
| 79 | // There is one {U, V} pair for 4 Y values. | ||
| 80 |
8/8✓ Branch 0 taken 4088 times.
✓ Branch 1 taken 4160 times.
✓ Branch 2 taken 4088 times.
✓ Branch 3 taken 4160 times.
✓ Branch 4 taken 4088 times.
✓ Branch 5 taken 4160 times.
✓ Branch 6 taken 4088 times.
✓ Branch 7 taken 4160 times.
|
32992 | if ((index % 2) == 0) { |
| 81 | 16640 | u_m128 = u_row[0] - 128; | |
| 82 | 16640 | v_m128 = v_row[0] - 128; | |
| 83 | 16640 | u_row += 1; | |
| 84 | 16640 | v_row += 1; | |
| 85 |
8/8✓ Branch 0 taken 2080 times.
✓ Branch 1 taken 2080 times.
✓ Branch 2 taken 2080 times.
✓ Branch 3 taken 2080 times.
✓ Branch 4 taken 2080 times.
✓ Branch 5 taken 2080 times.
✓ Branch 6 taken 2080 times.
✓ Branch 7 taken 2080 times.
|
16640 | if (v_first_) { |
| 86 | 8320 | std::swap(u_m128, v_m128); | |
| 87 | 8320 | } | |
| 88 | 16640 | } | |
| 89 | |||
| 90 | 32992 | yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows); | |
| 91 | 32992 | } | |
| 92 | 9936 | } | |
| 93 | }; // end of class YUVpToRGBxOrBGRx<bool, bool> | ||
| 94 | |||
| 95 | using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>; | ||
| 96 | using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>; | ||
| 97 | using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>; | ||
| 98 | using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>; | ||
| 99 | |||
| 100 | template <typename OperationType, typename ScalarType> | ||
| 101 | 640 | kleidicv_error_t yuv2rgbx_operation(OperationType &operation, | |
| 102 | const ScalarType *src, size_t src_stride, | ||
| 103 | ScalarType *dst, size_t dst_stride, | ||
| 104 | size_t width, size_t height, size_t begin, | ||
| 105 | size_t end) { | ||
| 106 |
16/16✓ Branch 0 taken 2 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 158 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 158 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 158 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 158 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 158 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 158 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 158 times.
|
640 | CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); |
| 107 |
16/16✓ Branch 0 taken 2 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 156 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 156 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 156 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 156 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 156 times.
|
632 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 108 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 148 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 152 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 148 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 148 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 152 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 148 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 148 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 152 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 148 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 148 times.
|
624 | CHECK_IMAGE_SIZE(width, height); |
| 109 | |||
| 110 | // Pointer to the start of the U plane. | ||
| 111 | // Since `src` points to a planar YUV buffer, the Y plane comes first, | ||
| 112 | // occupying `src_stride * height` bytes. | ||
| 113 | 592 | const ScalarType *u = src + src_stride * height; | |
| 114 | // Pointer to the start of the V plane. | ||
| 115 | // The V plane follows the U plane. Each chroma plane has one row of | ||
| 116 | // `width / 2` samples for every two luma rows, and two chroma rows are | ||
| 117 | // packed into each `src_stride`-sized line of the buffer (see `uv_strides` | ||
| 118 | // below). The U plane therefore occupies `height / 4` full lines. When | ||
| 119 | // `(height % 4) / 2` is 1, U has one more row that fills only the first | ||
| 120 | // `width / 2` bytes of the next line, so V does not start on a line | ||
| 121 | // boundary: it begins `width / 2` bytes into that line, right after the | ||
| 122 | // last U row. The formula below adds both parts: the full lines and the | ||
| 123 | // optional half-line offset. | ||
| 124 | 1184 | const ScalarType *v = | |
| 125 | 592 | u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); | |
| 126 | |||
| 127 | // These indices control how U and V row strides are selected across the image | ||
| 128 | // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to | ||
| 129 | // two luma (Y) rows. However, when the image height is not divisible by 4, | ||
| 130 | // the mapping between chroma and luma rows becomes asymmetric. Specifically, | ||
| 131 | // when `height % 4 == 2`, the start of the V plane is offset by one chroma | ||
| 132 | // row relative to U. | ||
| 133 | // | ||
| 134 | // This results in U and V rows being interleaved with a phase difference, | ||
| 135 | // which must be accounted for during row-wise traversal. To handle this, | ||
| 136 | // `u_index` and `v_index` are used to alternate the stride selection | ||
| 137 | // independently for U and V across the loop. | ||
| 138 | // | ||
| 139 | // This mechanism ensures that memory access patterns remain correct, | ||
| 140 | // especially in layouts where U and V share a contiguous buffer with | ||
| 141 | // alternating strides. Offsetting `v_index` allows the traversal logic to | ||
| 142 | // maintain correct alignment and prevents misaligned or incorrect reads from | ||
| 143 | // the chroma buffer. | ||
| 144 | 592 | size_t u_index = 0; | |
| 145 | 592 | size_t v_index = height % 4 == 2 ? 1 : 0; | |
| 146 | |||
| 147 | // Compute the actual row range in the Y plane (full resolution). | ||
| 148 | // Since each UV row maps to 2 Y rows, we double the begin/end indices. | ||
| 149 | 592 | size_t row_begin = begin * 2; | |
| 150 | 592 | size_t row_end = std::min<size_t>(height, end * 2); | |
| 151 | 592 | size_t row_uv = begin; | |
| 152 | |||
| 153 | // UV stepping pattern: first half of row, then padded second half. | ||
| 154 | // Needed to match row strides between chroma and luma components. | ||
| 155 | 592 | size_t uv_strides[2] = {width / 2, src_stride - width / 2}; | |
| 156 | |||
| 157 | // Calculate starting pointers for Y, U, and V planes at the given stripe | ||
| 158 | // start. | ||
| 159 | 592 | const ScalarType *y0 = src + row_begin * src_stride; | |
| 160 | 592 | u = u + (row_uv / 2) * src_stride; | |
| 161 | 592 | v = v + (row_uv / 2) * src_stride; | |
| 162 | |||
| 163 |
8/8✓ Branch 0 taken 118 times.
✓ Branch 1 taken 30 times.
✓ Branch 2 taken 118 times.
✓ Branch 3 taken 30 times.
✓ Branch 4 taken 118 times.
✓ Branch 5 taken 30 times.
✓ Branch 6 taken 118 times.
✓ Branch 7 taken 30 times.
|
592 | if (row_uv % 2 == 1) { |
| 164 | 120 | u += uv_strides[(u_index++) & 1]; | |
| 165 | 120 | v += uv_strides[(v_index++) & 1]; | |
| 166 | 120 | } | |
| 167 | |||
| 168 | 592 | size_t dcn = operation.output_channels(); | |
| 169 |
8/8✓ Branch 0 taken 2488 times.
✓ Branch 1 taken 148 times.
✓ Branch 2 taken 2488 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 2488 times.
✓ Branch 5 taken 148 times.
✓ Branch 6 taken 2488 times.
✓ Branch 7 taken 148 times.
|
10544 | for (size_t h = row_begin; h < row_end; h += 2) { |
| 170 | 9952 | ScalarType *row0 = dst + dst_stride * h; | |
| 171 | 9952 | ScalarType *row1 = dst + dst_stride * (h + 1); | |
| 172 | 9952 | const ScalarType *y1 = y0 + src_stride; | |
| 173 | |||
| 174 | // Guard for odd-height images. | ||
| 175 | // If the last row in the stripe is unpaired (odd number of rows), | ||
| 176 | // let the second row alias the first to avoid out-of-bounds access. | ||
| 177 |
8/8✓ Branch 0 taken 2432 times.
✓ Branch 1 taken 56 times.
✓ Branch 2 taken 2432 times.
✓ Branch 3 taken 56 times.
✓ Branch 4 taken 2432 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 2432 times.
✓ Branch 7 taken 56 times.
|
9952 | if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { |
| 178 | 224 | row1 = row0; | |
| 179 | 224 | y1 = y0; | |
| 180 | 224 | } | |
| 181 | |||
| 182 | 9952 | LoopUnroll2 loop{width, kVectorLength}; | |
| 183 | |||
| 184 | 10096 | loop.unroll_twice([&](size_t index) { | |
| 185 | 144 | uint8x16_t u_vec = vld1q_u8(u + index / 2); | |
| 186 | 144 | uint8x16_t v_vec = vld1q_u8(v + index / 2); | |
| 187 | 144 | uint8x16_t y0_vec = vld1q_u8(y0 + index); | |
| 188 | 144 | uint8x16_t y1_vec = vld1q_u8(y1 + index); | |
| 189 | 144 | uint8x16_t y2_vec = vld1q_u8(y0 + index + kVectorLength); | |
| 190 | 144 | uint8x16_t y3_vec = vld1q_u8(y1 + index + kVectorLength); | |
| 191 | |||
| 192 | 288 | operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec, | |
| 193 | 144 | &row0[index * dcn], &row1[index * dcn]); | |
| 194 | 144 | }); | |
| 195 | |||
| 196 | 19888 | loop.remaining([&](size_t index, size_t length) { | |
| 197 | 19872 | operation.scalar_path(length - index, y0 + index, y1 + index, | |
| 198 | 9936 | u + index / 2, v + index / 2, &row0[index * dcn], | |
| 199 | 9936 | &row1[index * dcn]); | |
| 200 | 9936 | }); | |
| 201 | |||
| 202 | 9952 | y0 += src_stride * 2; | |
| 203 | 9952 | u += uv_strides[(u_index++) & 1]; | |
| 204 | 9952 | v += uv_strides[(v_index++) & 1]; | |
| 205 | 9952 | } | |
| 206 | |||
| 207 | 592 | return KLEIDICV_OK; | |
| 208 | 640 | } | |
| 209 | |||
| 210 | KLEIDICV_TARGET_FN_ATTRS | ||
| 211 | 673 | kleidicv_error_t yuv420p_to_rgb_stripe_u8( | |
| 212 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 213 | size_t width, size_t height, kleidicv_color_conversion_t color_format, | ||
| 214 | size_t begin, size_t end) { | ||
| 215 |
9/9✓ Branch 0 taken 80 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 80 times.
✓ Branch 3 taken 80 times.
✓ Branch 4 taken 80 times.
✓ Branch 5 taken 80 times.
✓ Branch 6 taken 80 times.
✓ Branch 7 taken 33 times.
✓ Branch 8 taken 80 times.
|
673 | switch (color_format) { |
| 216 | case KLEIDICV_YV12_TO_BGR: { | ||
| 217 | 80 | YUVpToBGR operation{true}; | |
| 218 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 219 | 80 | width, height, begin, end); | |
| 220 | 80 | } | |
| 221 | |||
| 222 | case KLEIDICV_YV12_TO_RGB: { | ||
| 223 | 80 | YUVpToRGB operation{true}; | |
| 224 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 225 | 80 | width, height, begin, end); | |
| 226 | 80 | } | |
| 227 | |||
| 228 | case KLEIDICV_YV12_TO_BGRA: { | ||
| 229 | 80 | YUVpToBGRA operation{true}; | |
| 230 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 231 | 80 | width, height, begin, end); | |
| 232 | 80 | } | |
| 233 | |||
| 234 | case KLEIDICV_YV12_TO_RGBA: { | ||
| 235 | 80 | YUVpToRGBA operation{true}; | |
| 236 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 237 | 80 | width, height, begin, end); | |
| 238 | 80 | } | |
| 239 | |||
| 240 | case KLEIDICV_IYUV_TO_BGR: { | ||
| 241 | 80 | YUVpToBGR operation{false}; | |
| 242 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 243 | 80 | width, height, begin, end); | |
| 244 | 80 | } | |
| 245 | |||
| 246 | case KLEIDICV_IYUV_TO_RGB: { | ||
| 247 | 80 | YUVpToRGB operation{false}; | |
| 248 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 249 | 80 | width, height, begin, end); | |
| 250 | 80 | } | |
| 251 | |||
| 252 | case KLEIDICV_IYUV_TO_BGRA: { | ||
| 253 | 80 | YUVpToBGRA operation{false}; | |
| 254 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 255 | 80 | width, height, begin, end); | |
| 256 | 80 | } | |
| 257 | |||
| 258 | case KLEIDICV_IYUV_TO_RGBA: { | ||
| 259 | 80 | YUVpToRGBA operation{false}; | |
| 260 | 160 | return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, | |
| 261 | 80 | width, height, begin, end); | |
| 262 | 80 | } | |
| 263 | |||
| 264 | default: | ||
| 265 | 33 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 266 | } | ||
| 267 | |||
| 268 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | ||
| 269 | 673 | } | |
| 270 | |||
| 271 | } // namespace kleidicv::neon | ||
| 272 |