| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_YUV422_TO_RGB_SC_H | ||
| 6 | #define KLEIDICV_YUV422_TO_RGB_SC_H | ||
| 7 | |||
| 8 | #include <utility> | ||
| 9 | |||
| 10 | #include "kleidicv/conversions/yuv_to_rgb.h" | ||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/sve2.h" | ||
| 13 | #include "yuv42x_coefficients.h" | ||
| 14 | |||
| 15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 16 | template <size_t b_idx, size_t u_chroma_idx, size_t y_idx, size_t dcn, | ||
| 17 | bool use_unpack_path> | ||
| 18 | class YUV422ToRGBxOrBGRx { | ||
| 19 | public: | ||
| 20 | // Byte offsets for chroma samples inside a 4-byte YUV422 tuple (Y0 U Y1 V). | ||
| 21 | static constexpr size_t u_idx = u_chroma_idx; | ||
| 22 | static constexpr size_t v_idx = (u_idx + 2) % 4; | ||
| 23 | // Source channel count (scn = 2) because YUV422 is interleaved with | ||
| 24 | // two channels per pixel on average: one luma (Y) and one shared | ||
| 25 | // chroma (U or V). | ||
| 26 | static constexpr size_t scn = 2; | ||
| 27 | |||
| 28 | 2080 | static kleidicv_error_t yuv2rgbx_operation(const uint8_t* src, | |
| 29 | size_t src_stride, uint8_t* dst, | ||
| 30 | size_t dst_stride, size_t width, | ||
| 31 | size_t height) KLEIDICV_STREAMING { | ||
| 32 | // Keep track of the current output row being written. | ||
| 33 | 2080 | Rows<uint8_t> dst_rows{dst, dst_stride, dcn}; | |
| 34 | 2080 | auto kVectorLength = svcntb(); | |
| 35 | |||
| 36 | // Loop through rows along the image height. | ||
| 37 |
44/48✓ Branch 0 taken 52 times.
✓ Branch 1 taken 2288 times.
✓ Branch 2 taken 159 times.
✓ Branch 3 taken 6888 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 24 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 24 times.
✓ Branch 12 taken 51 times.
✓ Branch 13 taken 2288 times.
✓ Branch 14 taken 156 times.
✓ Branch 15 taken 6888 times.
✓ Branch 16 taken 51 times.
✓ Branch 17 taken 2288 times.
✓ Branch 18 taken 156 times.
✓ Branch 19 taken 6888 times.
✓ Branch 20 taken 51 times.
✓ Branch 21 taken 2288 times.
✓ Branch 22 taken 156 times.
✓ Branch 23 taken 6888 times.
✓ Branch 24 taken 51 times.
✓ Branch 25 taken 2288 times.
✓ Branch 26 taken 156 times.
✓ Branch 27 taken 6888 times.
✓ Branch 28 taken 51 times.
✓ Branch 29 taken 2288 times.
✓ Branch 30 taken 156 times.
✓ Branch 31 taken 6888 times.
✓ Branch 32 taken 51 times.
✓ Branch 33 taken 2288 times.
✓ Branch 34 taken 156 times.
✓ Branch 35 taken 6888 times.
✓ Branch 36 taken 51 times.
✓ Branch 37 taken 2288 times.
✓ Branch 38 taken 156 times.
✓ Branch 39 taken 6888 times.
✓ Branch 40 taken 51 times.
✓ Branch 41 taken 2288 times.
✓ Branch 42 taken 156 times.
✓ Branch 43 taken 6888 times.
✓ Branch 44 taken 51 times.
✓ Branch 45 taken 2288 times.
✓ Branch 46 taken 156 times.
✓ Branch 47 taken 6888 times.
|
93888 | for (size_t y = 0; y < height; y++, src += src_stride) { |
| 38 | 91808 | LoopUnroll2 loop{width, kVectorLength}; | |
| 39 | |||
| 40 | // Use loop.unroll_twice to process two pixels per iteration. | ||
| 41 | // In YUV422, two pixels are interleaved as (Y0, U0, Y1, V0). | ||
| 42 | // These four values produce two RGBx output pixels. By unrolling, | ||
| 43 | // we handle both pixels together in a single iteration, improving | ||
| 44 | // overall efficiency for that loop body. | ||
| 45 | struct UnrollTwiceFunctor { | ||
| 46 | const uint8_t* src_row; | ||
| 47 | Rows<uint8_t>& dst_rows; | ||
| 48 | size_t kVectorLength; | ||
| 49 | |||
| 50 | 576 | KLEIDICV_FORCE_INLINE void operator()(size_t index) const | |
| 51 | KLEIDICV_STREAMING { | ||
| 52 | 576 | svbool_t pg = svptrue_b8(); | |
| 53 | |||
| 54 | // Deinterleave the YUV422 data into separate channels. | ||
| 55 | // svld4() loads svcntb() groups of 4 bytes: (Y0, U0, Y1, V0). | ||
| 56 | // Because we unroll twice, we must process two pixels at once. | ||
| 57 | // Each pixel contributes two components (Y + chroma), so 4 vectors | ||
| 58 | // are required: Y0, Y1, U, and V. This is why the single svld4 yields | ||
| 59 | // 4 vectors instead of 2 — they directly correspond to the unrolled iteration. | ||
| 60 | 576 | svuint8x4_t yuv422 = svld4(pg, src_row + index * scn); | |
| 61 | 576 | svuint8_t y_even_lanes = svget4(yuv422, y_idx); | |
| 62 | 576 | svuint8_t y_odd_lanes = svget4(yuv422, y_idx + scn); | |
| 63 | 576 | svuint8_t u = svget4(yuv422, u_idx); | |
| 64 | 576 | svuint8_t v = svget4(yuv422, v_idx); | |
| 65 | |||
| 66 | // Convert two output vectors in one go (loop unrolled twice). | ||
| 67 | // The second destination pointer is advanced by kVectorLength * dcn: | ||
| 68 | // - kVectorLength: number of pixels produced per vector | ||
| 69 | // - dcn: destination channels per pixel (3 for RGB, 4 for RGBA) | ||
| 70 | // Because we emit two RGBx vectors per iteration, the second write | ||
| 71 | // starts exactly kVectorLength * dcn bytes after the first. | ||
| 72 | 576 | yuv422_to_rgb( | |
| 73 | pg, y_even_lanes, y_odd_lanes, u, v, | ||
| 74 | 576 | dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)), | |
| 75 | 1152 | dst_rows.as_columns().ptr_at( | |
| 76 | 576 | static_cast<ptrdiff_t>(index + kVectorLength)), | |
| 77 | pg, pg); | ||
| 78 | 576 | } | |
| 79 | }; | ||
| 80 | |||
| 81 | 91808 | loop.unroll_twice(UnrollTwiceFunctor{src, dst_rows, kVectorLength}); | |
| 82 | |||
| 83 | struct RemainingFunctor { | ||
| 84 | const uint8_t* src_row; | ||
| 85 | Rows<uint8_t>& dst_rows; | ||
| 86 | size_t kVectorLength; | ||
| 87 | |||
| 88 | 91808 | KLEIDICV_FORCE_INLINE void operator()(size_t index, size_t length) const | |
| 89 | KLEIDICV_STREAMING { | ||
| 90 | 91808 | svbool_t pg = svwhilelt_b8_u64(index, length); | |
| 91 | 91808 | svbool_t pg_st1 = svwhilelt_b8_u64(index, length); | |
| 92 | 91808 | svbool_t pg_st2 = svwhilelt_b8_u64(index + kVectorLength, length); | |
| 93 | |||
| 94 | 183616 | svuint8x4_t yuv422 = svld4(svwhilelt_b8_u64(0, (length - index) / 2), | |
| 95 | 91808 | src_row + index * scn); | |
| 96 | |||
| 97 | 91808 | svuint8_t y_even_lanes = svget4(yuv422, y_idx); | |
| 98 | 91808 | svuint8_t y_odd_lanes = svget4(yuv422, y_idx + scn); | |
| 99 | 91808 | svuint8_t u = svget4(yuv422, u_idx); | |
| 100 | 91808 | svuint8_t v = svget4(yuv422, v_idx); | |
| 101 | |||
| 102 | // Convert two output vectors in one go (loop unrolled twice). | ||
| 103 | // The second destination pointer is advanced by kVectorLength * dcn: | ||
| 104 | // - kVectorLength: number of pixels produced per vector | ||
| 105 | // - dcn: destination channels per pixel (3 for RGB, 4 for RGBA) | ||
| 106 | // Because we emit two RGBx vectors per iteration, the second write | ||
| 107 | // starts exactly kVectorLength * dcn bytes after the first. | ||
| 108 | 91808 | yuv422_to_rgb( | |
| 109 | pg, y_even_lanes, y_odd_lanes, u, v, | ||
| 110 | 91808 | dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)), | |
| 111 | 183616 | dst_rows.as_columns().ptr_at( | |
| 112 | 91808 | static_cast<ptrdiff_t>(index + kVectorLength)), | |
| 113 | pg_st1, pg_st2); | ||
| 114 | 91808 | } | |
| 115 | }; | ||
| 116 | |||
| 117 | 91808 | loop.remaining(RemainingFunctor{src, dst_rows, kVectorLength}); | |
| 118 | |||
| 119 | 91808 | ++dst_rows; | |
| 120 | 91808 | } | |
| 121 | 2080 | return KLEIDICV_OK; | |
| 122 | 2080 | } | |
| 123 | |||
| 124 | private: | ||
| 125 | // Convert two blocks of YUV422 (deinterleaved) data into RGBx color format. | ||
| 126 | // Each block contains 16 Y values (y0, y1) plus shared U and V values. | ||
| 127 | // The function computes R, G, B channels, normalizes, and stores | ||
| 128 | // results either as RGB (3 channels) or RGBA (4 channels). | ||
| 129 | KLEIDICV_FORCE_INLINE | ||
| 130 | 92384 | static void yuv422_to_rgb(svbool_t& pg, const svuint8_t& y_even_lanes, | |
| 131 | const svuint8_t& y_odd_lanes, const svuint8_t& u, | ||
| 132 | const svuint8_t& v, uint8_t* rgbx0, uint8_t* rgbx1, | ||
| 133 | svbool_t& pg_st1, | ||
| 134 | svbool_t& pg_st2) KLEIDICV_STREAMING { | ||
| 135 | // --- Preprocess Y channel --- | ||
| 136 | // Subtract 16 from luma (Y') with saturation and widen later to 32 bits. | ||
| 137 | 92384 | svuint8_t y_even_lanes_m16 = svqsub(y_even_lanes, static_cast<uint8_t>(16)); | |
| 138 | 92384 | svuint8_t y_odd_lanes_m16 = svqsub(y_odd_lanes, static_cast<uint8_t>(16)); | |
| 139 | |||
| 140 | // Expand Y values into 32-bit lanes for later arithmetic. | ||
| 141 | // Note: "even" and "odd" refer to the pixel position in the YUV422 packing, | ||
| 142 | // not the Y component itself. | ||
| 143 | // | ||
| 144 | // In YUV422, pixels are stored as (Y0, U0, Y1, V0). | ||
| 145 | // - The "even" vectors collect Y0, Y2, Y4, ... → these generate the | ||
| 146 | // even-positioned RGB outputs. | ||
| 147 | // - The "odd" vectors collect Y1, Y3, Y5, ... → these generate the | ||
| 148 | // odd-positioned RGB outputs. | ||
| 149 | // | ||
| 150 | // How it works here: | ||
| 151 | // 1. Split Y into low/high halves using svmovlb()/svmovlt(). | ||
| 152 | // 2. Widen each half to 32-bit lanes with svunpklo()/svunpkhi(). | ||
| 153 | // | ||
| 154 | // Why this may look unusual: | ||
| 155 | // - At first glance, you might expect “_lo_hi” to come from *_hi, but | ||
| 156 | // that would require extra moves and shuffles. | ||
| 157 | // - Current scheme uses only 2× svmov + 4× svunpk per group, which is | ||
| 158 | // efficient since the pipeline can issue two svunpk in parallel. | ||
| 159 | // - A more “intuitive” pairing would need 4× svmov + 2× svunpk, which is | ||
| 160 | // slower because svmov has less bundling freedom. | ||
| 161 | // - Using svmovlb/svmovlt also aligns lanes so later narrowing can run | ||
| 162 | // without additional shuffles, improving overall performance. | ||
| 163 | 92384 | svint32_t y_even_lo_lo{}, y_even_lo_hi{}, y_even_hi_lo{}, y_even_hi_hi{}; | |
| 164 | 92384 | svint32_t y_odd_lo_lo{}, y_odd_lo_hi{}, y_odd_hi_lo{}, y_odd_hi_hi{}; | |
| 165 | // Expand U and V into 32-bit lanes (shared chroma). | ||
| 166 | // In YUV422, each U and V value is shared by a pair of pixels: | ||
| 167 | // (Y_even, U, Y_odd, V) | ||
| 168 | // Therefore, the same U and V vectors are used when computing both | ||
| 169 | // the "even" and "odd" RGB outputs. | ||
| 170 | 92384 | svint32_t v_lo_lo{}, v_lo_hi{}, v_hi_lo{}, v_hi_hi{}; | |
| 171 | 92384 | svint32_t u_lo_lo{}, u_lo_hi{}, u_hi_lo{}, u_hi_hi{}; | |
| 172 | if constexpr (use_unpack_path) { | ||
| 173 | 22880 | svuint16_t y_even_lo = svmovlb(y_even_lanes_m16); | |
| 174 | 22880 | svuint16_t y_even_hi = svmovlt(y_even_lanes_m16); | |
| 175 | 22880 | svuint16_t y_odd_lo = svmovlb(y_odd_lanes_m16); | |
| 176 | 22880 | svuint16_t y_odd_hi = svmovlt(y_odd_lanes_m16); | |
| 177 | 22880 | y_even_lo_lo = svreinterpret_s32(svunpklo(y_even_lo)); | |
| 178 | 22880 | y_even_lo_hi = svreinterpret_s32(svunpklo(y_even_hi)); | |
| 179 | 22880 | y_even_hi_lo = svreinterpret_s32(svunpkhi(y_even_lo)); | |
| 180 | 22880 | y_even_hi_hi = svreinterpret_s32(svunpkhi(y_even_hi)); | |
| 181 | 22880 | y_odd_lo_lo = svreinterpret_s32(svunpklo(y_odd_lo)); | |
| 182 | 22880 | y_odd_lo_hi = svreinterpret_s32(svunpklo(y_odd_hi)); | |
| 183 | 22880 | y_odd_hi_lo = svreinterpret_s32(svunpkhi(y_odd_lo)); | |
| 184 | 22880 | y_odd_hi_hi = svreinterpret_s32(svunpkhi(y_odd_hi)); | |
| 185 | |||
| 186 | 22880 | svuint16_t v_lo = svmovlb(v); | |
| 187 | 22880 | svuint16_t v_hi = svmovlt(v); | |
| 188 | 22880 | svuint16_t u_lo = svmovlb(u); | |
| 189 | 22880 | svuint16_t u_hi = svmovlt(u); | |
| 190 | 22880 | v_lo_lo = svreinterpret_s32(svunpklo(v_lo)); | |
| 191 | 22880 | v_lo_hi = svreinterpret_s32(svunpklo(v_hi)); | |
| 192 | 22880 | v_hi_lo = svreinterpret_s32(svunpkhi(v_lo)); | |
| 193 | 22880 | v_hi_hi = svreinterpret_s32(svunpkhi(v_hi)); | |
| 194 | 22880 | u_lo_lo = svreinterpret_s32(svunpklo(u_lo)); | |
| 195 | 22880 | u_lo_hi = svreinterpret_s32(svunpklo(u_hi)); | |
| 196 | 22880 | u_hi_lo = svreinterpret_s32(svunpkhi(u_lo)); | |
| 197 | 22880 | u_hi_hi = svreinterpret_s32(svunpkhi(u_hi)); | |
| 198 | 22880 | } else { | |
| 199 | 139008 | svuint8_t index0 = svreinterpret_u8_u32( | |
| 200 | 69504 | svindex_u32(0xFFFFFF00, 0x0002)); // 0, 2, 4, 6, ... | |
| 201 | 139008 | svuint8_t index1 = svreinterpret_u8_u32( | |
| 202 | 69504 | svindex_u32(0xFFFFFF00 + svcnth(), 0x0002)); // 8, 10, 12, 14, ... | |
| 203 | 139008 | svuint8_t index2 = svreinterpret_u8_u32( | |
| 204 | 69504 | svindex_u32(0xFFFFFF01, 0x0002)); // 1, 3, 5, 7, ... | |
| 205 | 139008 | svuint8_t index3 = svreinterpret_u8_u32( | |
| 206 | 69504 | svindex_u32(0xFFFFFF01 + svcnth(), 0x0002)); // 9, 11, 13, 15, ... | |
| 207 | |||
| 208 | 69504 | y_even_lo_lo = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index0)); | |
| 209 | 69504 | y_even_lo_hi = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index2)); | |
| 210 | 69504 | y_even_hi_lo = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index1)); | |
| 211 | 69504 | y_even_hi_hi = svreinterpret_s32(svtbl_u8(y_even_lanes_m16, index3)); | |
| 212 | 69504 | y_odd_lo_lo = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index0)); | |
| 213 | 69504 | y_odd_lo_hi = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index2)); | |
| 214 | 69504 | y_odd_hi_lo = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index1)); | |
| 215 | 69504 | y_odd_hi_hi = svreinterpret_s32(svtbl_u8(y_odd_lanes_m16, index3)); | |
| 216 | |||
| 217 | 69504 | v_lo_lo = svreinterpret_s32(svtbl_u8(v, index0)); | |
| 218 | 69504 | v_lo_hi = svreinterpret_s32(svtbl_u8(v, index2)); | |
| 219 | 69504 | v_hi_lo = svreinterpret_s32(svtbl_u8(v, index1)); | |
| 220 | 69504 | v_hi_hi = svreinterpret_s32(svtbl_u8(v, index3)); | |
| 221 | 69504 | u_lo_lo = svreinterpret_s32(svtbl_u8(u, index0)); | |
| 222 | 69504 | u_lo_hi = svreinterpret_s32(svtbl_u8(u, index2)); | |
| 223 | 69504 | u_hi_lo = svreinterpret_s32(svtbl_u8(u, index1)); | |
| 224 | 69504 | u_hi_hi = svreinterpret_s32(svtbl_u8(u, index3)); | |
| 225 | 69504 | } | |
| 226 | |||
| 227 | // Scale the Y (luma) values by the fixed coefficient kYWeight. | ||
| 228 | // This produces the weighted luma contribution (Y') that forms the | ||
| 229 | // base term for all R, G, and B channel calculations in the | ||
| 230 | // YUV → RGB conversion. | ||
| 231 | 92384 | y_even_lo_lo = svmul_x(pg, y_even_lo_lo, kYWeight); | |
| 232 | 92384 | y_even_lo_hi = svmul_x(pg, y_even_lo_hi, kYWeight); | |
| 233 | 92384 | y_even_hi_lo = svmul_x(pg, y_even_hi_lo, kYWeight); | |
| 234 | 92384 | y_even_hi_hi = svmul_x(pg, y_even_hi_hi, kYWeight); | |
| 235 | 92384 | y_odd_lo_lo = svmul_x(pg, y_odd_lo_lo, kYWeight); | |
| 236 | 92384 | y_odd_lo_hi = svmul_x(pg, y_odd_lo_hi, kYWeight); | |
| 237 | 92384 | y_odd_hi_lo = svmul_x(pg, y_odd_hi_lo, kYWeight); | |
| 238 | 92384 | y_odd_hi_hi = svmul_x(pg, y_odd_hi_hi, kYWeight); | |
| 239 | |||
| 240 | // Precompute constant base offsets for R, G, and B channels. | ||
| 241 | // These include the rounding term (1 << (kWeightScale - 1)) and the | ||
| 242 | // bias correction for centering U and V around 128. | ||
| 243 | // This ensures that chroma values (U,V) are properly zero-based before | ||
| 244 | // applying their respective weighting factors in the YUV → RGB formulas. | ||
| 245 | 92384 | constexpr int32_t kOffset = 1 << (kWeightScale - 1); | |
| 246 | 92384 | svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]); | |
| 247 | 184768 | svint32_t g_base = | |
| 248 | 92384 | svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2])); | |
| 249 | 92384 | svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]); | |
| 250 | |||
| 251 | // --- Compute the Red channel --- | ||
| 252 | // Formula: R = Y + (kRV * V) + bias | ||
| 253 | // - Start with r_base (rounding + bias correction for V centered at 128). | ||
| 254 | // - Multiply V by kUVWeights[kRVWeightIndex] and add the result to r_base. | ||
| 255 | // - Reuse the same V contribution for both even and odd pixels, since | ||
| 256 | // chroma is shared in YUV422. | ||
| 257 | // - Finally, add the weighted Y values (even and odd) to produce | ||
| 258 | // the full R channel before normalization and packing to 8 bits. | ||
| 259 | 184768 | svint32_t r_even_lo_lo = | |
| 260 | 92384 | svmla_x(pg, r_base, v_lo_lo, kUVWeights[kRVWeightIndex]); | |
| 261 | 184768 | svint32_t r_even_lo_hi = | |
| 262 | 92384 | svmla_x(pg, r_base, v_lo_hi, kUVWeights[kRVWeightIndex]); | |
| 263 | 184768 | svint32_t r_even_hi_lo = | |
| 264 | 92384 | svmla_x(pg, r_base, v_hi_lo, kUVWeights[kRVWeightIndex]); | |
| 265 | 184768 | svint32_t r_even_hi_hi = | |
| 266 | 92384 | svmla_x(pg, r_base, v_hi_hi, kUVWeights[kRVWeightIndex]); | |
| 267 | |||
| 268 | // Re-interleave and pack the Red channel to u8. | ||
| 269 | // We computed R in four 4-lane chunks split by pixel parity: | ||
| 270 | // r_even_lo_lo (even pixels 0..3), r_even_lo_hi (even 4..7) | ||
| 271 | // r_even_hi_lo (even pixels 8..11), r_even_hi_hi (even 12..15) | ||
| 272 | // The same chroma sums drive the odd pixels, so we reuse the r_even_* | ||
| 273 | // vectors when adding the odd Y lanes below. | ||
| 274 | // normalize_and_pack(...) saturates → shifts → narrows s32→u8 *and* | ||
| 275 | // interleaves even/odd so the output is in raster order: | ||
| 276 | // [R0, R1, R2, R3, ...] (i.e., even0, odd0, even1, odd1, ...). | ||
| 277 | // r0 packs the first 16 R samples; r1 packs the next 16, which come from | ||
| 278 | // the *_hi_* groups. | ||
| 279 | 92384 | svint16_t r0_even = svaddhnb(r_even_lo_lo, y_even_lo_lo); | |
| 280 | 92384 | r0_even = svaddhnt(r0_even, r_even_lo_hi, y_even_lo_hi); | |
| 281 | 92384 | r0_even = svsra(svdup_n_s16(0), r0_even, kWeightScale - 16); | |
| 282 | 92384 | svint16_t r0_odd = svaddhnb(r_even_lo_lo, y_odd_lo_lo); | |
| 283 | 92384 | r0_odd = svaddhnt(r0_odd, r_even_lo_hi, y_odd_lo_hi); | |
| 284 | 92384 | r0_odd = svsra(svdup_n_s16(0), r0_odd, kWeightScale - 16); | |
| 285 | 92384 | svuint8_t r0 = svqxtunt(svqxtunb(r0_even), r0_odd); | |
| 286 | |||
| 287 | 92384 | svint16_t r1_even = svaddhnb(r_even_hi_lo, y_even_hi_lo); | |
| 288 | 92384 | r1_even = svaddhnt(r1_even, r_even_hi_hi, y_even_hi_hi); | |
| 289 | 92384 | r1_even = svsra(svdup_n_s16(0), r1_even, kWeightScale - 16); | |
| 290 | 92384 | svint16_t r1_odd = svaddhnb(r_even_hi_lo, y_odd_hi_lo); | |
| 291 | 92384 | r1_odd = svaddhnt(r1_odd, r_even_hi_hi, y_odd_hi_hi); | |
| 292 | 92384 | r1_odd = svsra(svdup_n_s16(0), r1_odd, kWeightScale - 16); | |
| 293 | 92384 | svuint8_t r1 = svqxtunt(svqxtunb(r1_even), r1_odd); | |
| 294 | |||
| 295 | // --- Compute the Green channel --- | ||
| 296 | // Formula: G = Y + (kGU * U + kGV * V) + bias, reusing the shared chroma | ||
| 297 | // samples for both pixels in each YUV422 pair before normalize/pack | ||
| 298 | // interleaves them back into raster order. | ||
| 299 | 184768 | svint32_t g_even_lo_lo = | |
| 300 | 92384 | svmla_x(pg, g_base, u_lo_lo, kUVWeights[kGUWeightIndex]); | |
| 301 | 184768 | svint32_t g_even_lo_hi = | |
| 302 | 92384 | svmla_x(pg, g_base, u_lo_hi, kUVWeights[kGUWeightIndex]); | |
| 303 | 184768 | svint32_t g_even_hi_lo = | |
| 304 | 92384 | svmla_x(pg, g_base, u_hi_lo, kUVWeights[kGUWeightIndex]); | |
| 305 | 184768 | svint32_t g_even_hi_hi = | |
| 306 | 92384 | svmla_x(pg, g_base, u_hi_hi, kUVWeights[kGUWeightIndex]); | |
| 307 | 92384 | g_even_lo_lo = | |
| 308 | 92384 | svmla_x(pg, g_even_lo_lo, v_lo_lo, kUVWeights[kGVWeightIndex]); | |
| 309 | 92384 | g_even_lo_hi = | |
| 310 | 92384 | svmla_x(pg, g_even_lo_hi, v_lo_hi, kUVWeights[kGVWeightIndex]); | |
| 311 | 92384 | g_even_hi_lo = | |
| 312 | 92384 | svmla_x(pg, g_even_hi_lo, v_hi_lo, kUVWeights[kGVWeightIndex]); | |
| 313 | 92384 | g_even_hi_hi = | |
| 314 | 92384 | svmla_x(pg, g_even_hi_hi, v_hi_hi, kUVWeights[kGVWeightIndex]); | |
| 315 | |||
| 316 | 92384 | svint16_t g0_even = svaddhnb(g_even_lo_lo, y_even_lo_lo); | |
| 317 | 92384 | g0_even = svaddhnt(g0_even, g_even_lo_hi, y_even_lo_hi); | |
| 318 | 92384 | g0_even = svsra(svdup_n_s16(0), g0_even, kWeightScale - 16); | |
| 319 | // Same rationale as for Red: reuse the g_even_* chroma base when adding | ||
| 320 | // the odd Y lanes, avoiding redundant temporaries. | ||
| 321 | 92384 | svint16_t g0_odd = svaddhnb(g_even_lo_lo, y_odd_lo_lo); | |
| 322 | 92384 | g0_odd = svaddhnt(g0_odd, g_even_lo_hi, y_odd_lo_hi); | |
| 323 | 92384 | g0_odd = svsra(svdup_n_s16(0), g0_odd, kWeightScale - 16); | |
| 324 | 92384 | svuint8_t g0 = svqxtunt(svqxtunb(g0_even), g0_odd); | |
| 325 | |||
| 326 | 92384 | svint16_t g1_even = svaddhnb(g_even_hi_lo, y_even_hi_lo); | |
| 327 | 92384 | g1_even = svaddhnt(g1_even, g_even_hi_hi, y_even_hi_hi); | |
| 328 | 92384 | g1_even = svsra(svdup_n_s16(0), g1_even, kWeightScale - 16); | |
| 329 | 92384 | svint16_t g1_odd = svaddhnb(g_even_hi_lo, y_odd_hi_lo); | |
| 330 | 92384 | g1_odd = svaddhnt(g1_odd, g_even_hi_hi, y_odd_hi_hi); | |
| 331 | 92384 | g1_odd = svsra(svdup_n_s16(0), g1_odd, kWeightScale - 16); | |
| 332 | 92384 | svuint8_t g1 = svqxtunt(svqxtunb(g1_even), g1_odd); | |
| 333 | |||
| 334 | // --- Compute the Blue channel --- | ||
| 335 | // Formula: B = Y + (kBU * U) + bias, sharing the U samples across each | ||
| 336 | // even/odd pair before normalize/pack interleaves the results. | ||
| 337 | 184768 | svint32_t b_even_lo_lo = | |
| 338 | 92384 | svmla_x(pg, b_base, u_lo_lo, kUVWeights[kBUWeightIndex]); | |
| 339 | 184768 | svint32_t b_even_lo_hi = | |
| 340 | 92384 | svmla_x(pg, b_base, u_lo_hi, kUVWeights[kBUWeightIndex]); | |
| 341 | 184768 | svint32_t b_even_hi_lo = | |
| 342 | 92384 | svmla_x(pg, b_base, u_hi_lo, kUVWeights[kBUWeightIndex]); | |
| 343 | 184768 | svint32_t b_even_hi_hi = | |
| 344 | 92384 | svmla_x(pg, b_base, u_hi_hi, kUVWeights[kBUWeightIndex]); | |
| 345 | |||
| 346 | 92384 | svint16_t b0_even = svaddhnb(b_even_lo_lo, y_even_lo_lo); | |
| 347 | 92384 | b0_even = svaddhnt(b0_even, b_even_lo_hi, y_even_lo_hi); | |
| 348 | 92384 | b0_even = svsra(svdup_n_s16(0), b0_even, kWeightScale - 16); | |
| 349 | // Blue follows the same pattern, so reuse the b_even_* vectors for odd Y. | ||
| 350 | 92384 | svint16_t b0_odd = svaddhnb(b_even_lo_lo, y_odd_lo_lo); | |
| 351 | 92384 | b0_odd = svaddhnt(b0_odd, b_even_lo_hi, y_odd_lo_hi); | |
| 352 | 92384 | b0_odd = svsra(svdup_n_s16(0), b0_odd, kWeightScale - 16); | |
| 353 | 92384 | svuint8_t b0 = svqxtunt(svqxtunb(b0_even), b0_odd); | |
| 354 | |||
| 355 | 92384 | svint16_t b1_even = svaddhnb(b_even_hi_lo, y_even_hi_lo); | |
| 356 | 92384 | b1_even = svaddhnt(b1_even, b_even_hi_hi, y_even_hi_hi); | |
| 357 | 92384 | b1_even = svsra(svdup_n_s16(0), b1_even, kWeightScale - 16); | |
| 358 | 92384 | svint16_t b1_odd = svaddhnb(b_even_hi_lo, y_odd_hi_lo); | |
| 359 | 92384 | b1_odd = svaddhnt(b1_odd, b_even_hi_hi, y_odd_hi_hi); | |
| 360 | 92384 | b1_odd = svsra(svdup_n_s16(0), b1_odd, kWeightScale - 16); | |
| 361 | 92384 | svuint8_t b1 = svqxtunt(svqxtunb(b1_even), b1_odd); | |
| 362 | |||
| 363 | if constexpr (dcn > 3) { | ||
| 364 | 110688 | svuint8x4_t rgba0 = | |
| 365 | 55344 | svcreate4(b_idx ? r0 : b0, g0, b_idx ? b0 : r0, svdup_n_u8(0xFF)); | |
| 366 | 110688 | svuint8x4_t rgba1 = | |
| 367 | 55344 | svcreate4(b_idx ? r1 : b1, g1, b_idx ? b1 : r1, svdup_n_u8(0xFF)); | |
| 368 | // Store RGBA pixels to memory. | ||
| 369 | 55344 | svst4_u8(pg_st1, rgbx0, rgba0); | |
| 370 | 55344 | svst4_u8(pg_st2, rgbx1, rgba1); | |
| 371 | 55344 | } else { | |
| 372 | 37040 | svuint8x3_t rgb0 = svcreate3(b_idx ? r0 : b0, g0, b_idx ? b0 : r0); | |
| 373 | 37040 | svuint8x3_t rgb1 = svcreate3(b_idx ? r1 : b1, g1, b_idx ? b1 : r1); | |
| 374 | // Store RGB pixels to memory. | ||
| 375 | 37040 | svst3(pg_st1, rgbx0, rgb0); | |
| 376 | 37040 | svst3(pg_st2, rgbx1, rgb1); | |
| 377 | 37040 | } | |
| 378 | 92384 | } | |
| 379 | }; | ||
| 380 | |||
| 381 | template <size_t b_idx, size_t u_chroma_idx, size_t y_idx, size_t dcn> | ||
| 382 | 2080 | kleidicv_error_t dispatch_yuv422_to_rgb(bool use_unpack_path, | |
| 383 | const uint8_t* src, size_t src_stride, | ||
| 384 | uint8_t* dst, size_t dst_stride, | ||
| 385 | size_t width, size_t height) { | ||
| 386 |
22/24✓ Branch 0 taken 52 times.
✓ Branch 1 taken 159 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 51 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 51 times.
✓ Branch 9 taken 156 times.
✓ Branch 10 taken 51 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 51 times.
✓ Branch 13 taken 156 times.
✓ Branch 14 taken 51 times.
✓ Branch 15 taken 156 times.
✓ Branch 16 taken 51 times.
✓ Branch 17 taken 156 times.
✓ Branch 18 taken 51 times.
✓ Branch 19 taken 156 times.
✓ Branch 20 taken 51 times.
✓ Branch 21 taken 156 times.
✓ Branch 22 taken 51 times.
✓ Branch 23 taken 156 times.
|
2080 | if (use_unpack_path) { |
| 387 | 511 | return YUV422ToRGBxOrBGRx<b_idx, u_chroma_idx, y_idx, dcn, | |
| 388 | 511 | true>::yuv2rgbx_operation(src, src_stride, dst, | |
| 389 | 511 | dst_stride, width, | |
| 390 | 511 | height); | |
| 391 | } | ||
| 392 | 1569 | return YUV422ToRGBxOrBGRx<b_idx, u_chroma_idx, y_idx, dcn, | |
| 393 | 1569 | false>::yuv2rgbx_operation(src, src_stride, dst, | |
| 394 | 1569 | dst_stride, width, | |
| 395 | 1569 | height); | |
| 396 | 2080 | } | |
| 397 | |||
| 398 | KLEIDICV_TARGET_FN_ATTRS | ||
| 399 | 2220 | static kleidicv_error_t yuv422_to_rgb_u8_sc( | |
| 400 | const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, | ||
| 401 | size_t width, size_t height, | ||
| 402 | kleidicv_color_conversion_t color_format) KLEIDICV_STREAMING { | ||
| 403 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2217 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 2217 times.
|
2220 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 404 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2214 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 2214 times.
|
2217 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 405 |
6/6✓ Branch 0 taken 11 times.
✓ Branch 1 taken 2203 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 2196 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 2196 times.
|
2214 | CHECK_IMAGE_SIZE(width, height); |
| 406 | |||
| 407 | // YUV422 packs pixels in pairs: (Y0, U, Y1, V). | ||
| 408 | // Therefore, the image width must be at least 2 and always even. | ||
| 409 |
4/4✓ Branch 0 taken 2189 times.
✓ Branch 1 taken 7 times.
✓ Branch 2 taken 39 times.
✓ Branch 3 taken 2150 times.
|
2196 | if (width < 2 || (width % 2) != 0) { |
| 410 | 46 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 411 | } | ||
| 412 | 2150 | bool use_unpack_path = KLEIDICV_UNLIKELY(svcntb() >= 256); | |
| 413 |
13/13✓ Branch 0 taken 207 times.
✓ Branch 1 taken 207 times.
✓ Branch 2 taken 70 times.
✓ Branch 3 taken 211 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 207 times.
✓ Branch 7 taken 207 times.
✓ Branch 8 taken 207 times.
✓ Branch 9 taken 207 times.
✓ Branch 10 taken 207 times.
✓ Branch 11 taken 207 times.
✓ Branch 12 taken 207 times.
|
2150 | switch (color_format) { |
| 414 | case KLEIDICV_YUYV_TO_BGR: | ||
| 415 | 211 | return dispatch_yuv422_to_rgb<0, 1, 0, 3>( | |
| 416 | 211 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 417 | break; | ||
| 418 | case KLEIDICV_UYVY_TO_BGR: | ||
| 419 | 3 | return dispatch_yuv422_to_rgb<0, 0, 1, 3>( | |
| 420 | 3 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 421 | break; | ||
| 422 | case KLEIDICV_YVYU_TO_BGR: | ||
| 423 | 3 | return dispatch_yuv422_to_rgb<0, 3, 0, 3>( | |
| 424 | 3 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 425 | break; | ||
| 426 | case KLEIDICV_YUYV_TO_RGB: | ||
| 427 | 207 | return dispatch_yuv422_to_rgb<2, 1, 0, 3>( | |
| 428 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 429 | break; | ||
| 430 | case KLEIDICV_UYVY_TO_RGB: | ||
| 431 | 207 | return dispatch_yuv422_to_rgb<2, 0, 1, 3>( | |
| 432 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 433 | break; | ||
| 434 | case KLEIDICV_YVYU_TO_RGB: | ||
| 435 | 207 | return dispatch_yuv422_to_rgb<2, 3, 0, 3>( | |
| 436 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 437 | break; | ||
| 438 | case KLEIDICV_YUYV_TO_BGRA: | ||
| 439 | 207 | return dispatch_yuv422_to_rgb<0, 1, 0, 4>( | |
| 440 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 441 | break; | ||
| 442 | case KLEIDICV_UYVY_TO_BGRA: | ||
| 443 | 207 | return dispatch_yuv422_to_rgb<0, 0, 1, 4>( | |
| 444 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 445 | break; | ||
| 446 | case KLEIDICV_YVYU_TO_BGRA: | ||
| 447 | 207 | return dispatch_yuv422_to_rgb<0, 3, 0, 4>( | |
| 448 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 449 | break; | ||
| 450 | case KLEIDICV_YUYV_TO_RGBA: | ||
| 451 | 207 | return dispatch_yuv422_to_rgb<2, 1, 0, 4>( | |
| 452 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 453 | break; | ||
| 454 | case KLEIDICV_UYVY_TO_RGBA: | ||
| 455 | 207 | return dispatch_yuv422_to_rgb<2, 0, 1, 4>( | |
| 456 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 457 | break; | ||
| 458 | case KLEIDICV_YVYU_TO_RGBA: | ||
| 459 | 207 | return dispatch_yuv422_to_rgb<2, 3, 0, 4>( | |
| 460 | 207 | use_unpack_path, src, src_stride, dst, dst_stride, width, height); | |
| 461 | break; | ||
| 462 | default: | ||
| 463 | 70 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 464 | break; | ||
| 465 | } | ||
| 466 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | ||
| 467 | 2220 | } | |
| 468 | |||
| 469 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 470 | |||
| 471 | #endif // KLEIDICV_YUV422_TO_RGB_SC_H | ||
| 472 |