KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/conversions/yuv422_to_rgb_neon.cpp
Date:	2026-03-05 15:57:40
	Exec	Total	Coverage
Lines:	240	240	100.0%
Functions:	73	73	100.0%
Branches:	103	103	100.0%
  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <utility>
    
      #include "kleidicv/conversions/yuv_to_rgb.h"
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/neon.h"
    
      #include "yuv42x_coefficients.h"
    
      namespace kleidicv::neon {
    
      // Converts packed 8-bit YUV422 rows into interleaved 3- or 4-channel output.
    
      // Full vectors go through the NEON routine; whatever LoopUnroll2 reports as
    
      // the remainder goes through the scalar routine.
    
      //
    
      // Template parameters, as used in this class:
    
      //   b_idx        - index of the blue channel in an output pixel; red is
    
      //                  written at (2 - b_idx) and green always at index 1.
    
      //   u_chroma_idx - byte offset of U inside each 4-byte source group; V is
    
      //                  read from (u_chroma_idx + 2) % 4.
    
      //   y_idx        - byte offset of the first luma sample in the group; the
    
      //                  second luma sample is read at y_idx + scn.
    
      //   dcn          - destination channels per pixel; when dcn > 3 an alpha
    
      //                  value of 0xFF is written at index 3.
    
      template <size_t b_idx, size_t u_chroma_idx, size_t y_idx, size_t dcn>
    
      class YUV422ToRGBxOrBGRx {
    
       public:
    
        // Byte offsets for chroma samples inside a 4-byte YUV422 tuple (Y0 U Y1 V).
    
        static constexpr size_t u_idx = u_chroma_idx;
    
        static constexpr size_t v_idx = (u_idx + 2) % 4;
    
        // Source channel count (scn = 2) because YUV422 is interleaved with
    
        // two channels per pixel on average: one luma (Y) and one shared
    
        // chroma (U or V).
    
        static constexpr size_t scn = 2;
    
        // Converts `height` rows of packed YUV422 starting at `src` into `dst`.
    
        // `src` is advanced by `src_stride` after every row, while the destination
    
        // is walked through a Rows<uint8_t> built from `dst_stride` and dcn.
    
        // `width` is a pixel count: source bytes are addressed as index * scn.
    
        // No argument validation happens here; the function always returns
    
        // KLEIDICV_OK.
    
      523
        static kleidicv_error_t yuv2rgbx_operation(const uint8_t* src,
    
                                                   size_t src_stride, uint8_t* dst,
    
                                                   size_t dst_stride, size_t width,
    
                                                   size_t height) {
    
      523
          Rows<uint8_t> dst_rows{dst, dst_stride, dcn};
    
        24/24✓ Branch 0 taken 53 times.
✓ Branch 1 taken 2296 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 2296 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 2296 times.
✓ Branch 10 taken 52 times.
✓ Branch 11 taken 2296 times.
✓ Branch 12 taken 52 times.
✓ Branch 13 taken 2296 times.
✓ Branch 14 taken 52 times.
✓ Branch 15 taken 2296 times.
✓ Branch 16 taken 52 times.
✓ Branch 17 taken 2296 times.
✓ Branch 18 taken 52 times.
✓ Branch 19 taken 2296 times.
✓ Branch 20 taken 52 times.
✓ Branch 21 taken 2296 times.
✓ Branch 22 taken 52 times.
✓ Branch 23 taken 2296 times.

      23499
          for (size_t y = 0; y < height; y++, src += src_stride) {
    
      22976
            LoopUnroll2 loop{width, kVectorLength};
    
            // Use loop.unroll_twice so each vector iteration emits two output
    
            // vectors. In YUV422, two pixels are interleaved as (Y0, U0, Y1, V0),
    
            // so every 4-byte group produces two RGBx output pixels. The functor
    
            // below handles the even and the odd pixel of each group together,
    
            // which lets both reuse the same chroma contribution.
    
            struct UnrollTwiceFunctor {
    
              const uint8_t* src_row;
    
              Rows<uint8_t>& dst_rows;
    
      384
              KLEIDICV_FORCE_INLINE void operator()(size_t index) const {
    
                // Deinterleave the YUV422 data into separate channels.
    
                // vld4q_u8() loads 16 groups of 4 bytes: (Y0, U0, Y1, V0).
    
                // Because we unroll twice, we must process two pixels at once.
    
                // Each pixel contributes two components (Y + chroma), so 4 vectors
    
                // are required: Y0, Y1, U, and V. This is why a 4-way deinterleaving
    
                // load is used instead of a 2-way one — it matches the unrolled iteration.
    
      384
                uint8x16x4_t yuv422 = vld4q_u8(src_row + index * scn);
    
      384
                uint8x16_t y_even_lanes = yuv422.val[y_idx];
    
      384
                uint8x16_t y_odd_lanes = yuv422.val[y_idx + scn];
    
      384
                uint8x16_t u = yuv422.val[u_idx];
    
      384
                uint8x16_t v = yuv422.val[v_idx];
    
                // Convert two output vectors in one go (loop unrolled twice).
    
                // The second destination pointer is advanced by kVectorLength * dcn:
    
                //   - kVectorLength: number of pixels produced per vector
    
                //   - dcn: destination channels per pixel (3 for RGB, 4 for RGBA)
    
                // Because we emit two RGBx vectors per iteration, the second write
    
                // starts exactly kVectorLength * dcn bytes after the first.
    
      384
                yuv422_to_rgb(
    
                    y_even_lanes, y_odd_lanes, u, v,
    
      384
                    dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)),
    
      768
                    dst_rows.as_columns().ptr_at(
    
      384
                        static_cast<ptrdiff_t>(index + kVectorLength)));
    
      384
              }
    
            };
    
      22976
            loop.unroll_twice(UnrollTwiceFunctor{src, dst_rows});
    
            // Scalar loop over YUV422 pixels.
    
            // index advances by 2 because each 4-byte source group yields two
    
            // output pixels that share one U and one V sample.
    
            struct RemainingFunctor {
    
              const uint8_t* src_row;
    
              Rows<uint8_t>& dst_rows;
    
      22976
              KLEIDICV_FORCE_INLINE void operator()(size_t index,
    
                                                    size_t length) const {
    
        24/24✓ Branch 0 taken 3550 times.
✓ Branch 1 taken 2296 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 3550 times.
✓ Branch 7 taken 2296 times.
✓ Branch 8 taken 3550 times.
✓ Branch 9 taken 2296 times.
✓ Branch 10 taken 3550 times.
✓ Branch 11 taken 2296 times.
✓ Branch 12 taken 3550 times.
✓ Branch 13 taken 2296 times.
✓ Branch 14 taken 3550 times.
✓ Branch 15 taken 2296 times.
✓ Branch 16 taken 3550 times.
✓ Branch 17 taken 2296 times.
✓ Branch 18 taken 3550 times.
✓ Branch 19 taken 2296 times.
✓ Branch 20 taken 3550 times.
✓ Branch 21 taken 2296 times.
✓ Branch 22 taken 3550 times.
✓ Branch 23 taken 2296 times.

      58492
                for (; index < length; index += 2) {
    
      35516
                  const uint8_t u = src_row[(index * scn) + u_idx];
    
      35516
                  const uint8_t v = src_row[(index * scn) + v_idx];
    
      35516
                  const uint8_t y0 = src_row[(index * scn) + y_idx];
    
      35516
                  const uint8_t y1 = src_row[(index * scn) + y_idx + scn];
    
                  // Centre the chroma samples around zero before weighting them.
    
      35516
                  const int32_t u_m128 = static_cast<int32_t>(u) - 128;
    
      35516
                  const int32_t v_m128 = static_cast<int32_t>(v) - 128;
    
      35516
                  const uint8_t y_rows[2] = {y0, y1};
    
                  // Destination pointers for the even pixel and its odd neighbour.
    
      106548
                  uint8_t* rgbx_rows[2] = {
    
      35516
                      dst_rows.as_columns().ptr_at(static_cast<ptrdiff_t>(index)),
    
      71032
                      dst_rows.as_columns().ptr_at(
    
      35516
                          static_cast<ptrdiff_t>(index + 1))};
    
      35516
                  yuv422_to_rgb(y_rows, u_m128, v_m128, rgbx_rows);
    
      35516
                }
    
      22976
              }
    
            };
    
      22976
            loop.remaining(RemainingFunctor{src, dst_rows});
    
      22976
            ++dst_rows;
    
      22976
          }
    
      523
          return KLEIDICV_OK;
    
      523
        }
    
       private:
    
        // Narrows four s32 accumulators into one u8 vector in raster order.
    
        // At the call sites below vec_0/vec_1 carry eight even-pixel values and
    
        // vec_2/vec_3 the eight matching odd-pixel values.
    
        // Shifting right by (kWeightScale - 8) with unsigned saturation to 16 bits
    
        // leaves the final 8-bit sample in the upper byte of every 16-bit lane
    
        // (assuming little-endian lane layout); vtrn2q_u8 then keeps only the
    
        // odd-indexed bytes of both inputs and alternates them, which yields
    
        // even0, odd0, even1, odd1, ...
    
        // NOTE(review): ACLE declares vqshrun_n_s32 / vqshrun_high_n_s32 with
    
        // unsigned 16-bit results; the signed int16 locals here presumably rely on
    
        // implicit vector conversions being enabled — confirm.
    
        KLEIDICV_FORCE_INLINE
    
      2304
        static uint8x16_t normalize_and_pack(int32x4_t vec_0, int32x4_t vec_1,
    
                                             int32x4_t vec_2, int32x4_t vec_3) {
    
      2304
          int16x4_t tmp_lo_lo = vqshrun_n_s32(vec_0, kWeightScale - 8);
    
      4608
          int16x8_t tmp_lo_hi =
    
      2304
              vqshrun_high_n_s32(tmp_lo_lo, vec_1, kWeightScale - 8);
    
      2304
          int16x4_t tmp_hi_lo = vqshrun_n_s32(vec_2, kWeightScale - 8);
    
      4608
          int16x8_t tmp_hi_hi =
    
      2304
              vqshrun_high_n_s32(tmp_hi_lo, vec_3, kWeightScale - 8);
    
      4608
          uint8x16_t output =
    
      2304
              vtrn2q_u8(vreinterpretq_u8(tmp_lo_hi), vreinterpretq_u8(tmp_hi_hi));
    
      4608
          return output;
    
      2304
        }
    
        // Convert two blocks of YUV422 (deinterleaved) data into RGBx color format.
    
        // Each block contains 16 Y values (y_even_lanes, y_odd_lanes) plus shared U
    
        // and V values. The function computes R, G, B channels, normalizes, and
    
        // stores results either as RGB (3 channels) or RGBA (4 channels).
    
        // rgbx0 receives the first 16 output pixels and rgbx1 the following 16.
    
        KLEIDICV_FORCE_INLINE
    
      384
        static void yuv422_to_rgb(const uint8x16_t& y_even_lanes,
    
                                  const uint8x16_t& y_odd_lanes, const uint8x16_t& u,
    
                                  const uint8x16_t& v, uint8_t* rgbx0,
    
                                  uint8_t* rgbx1) {
    
          // --- Preprocess Y channel ---
    
          // Subtract 16 from luma (Y') with saturation and widen later to 32 bits.
    
      384
          uint8x16_t y_even_lanes_m16 = vqsubq_u8(y_even_lanes, vdupq_n_u8(16));
    
      384
          uint8x16_t y_odd_lanes_m16 = vqsubq_u8(y_odd_lanes, vdupq_n_u8(16));
    
          // --- Zero-extend (8 → 32) via table lookups ---
    
          // The masks feed vqtbl1q_u8 so each lookup pulls 4 bytes out of a 16-lane
    
          // u8 vector and places the selected byte as the least-significant byte of a
    
          // 32-bit lane while zeroing the remaining three bytes.
    
          // vqtbl1q_u8 inserts 0 for indices ≥ 16 (e.g., 0xFF), letting us build
    
          // [x,0,0,0] groups that we reinterpret as int32x4_t to get u8→s32 lanes in
    
          // one step.
    
      384
          const uint8x16_t index_0 = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff,
    
                                      2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff};
    
      384
          const uint8x16_t index_1 = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff,
    
                                      6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff};
    
      384
          const uint8x16_t index_2 = {8,  0xff, 0xff, 0xff, 9,  0xff, 0xff, 0xff,
    
                                      10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff};
    
      384
          const uint8x16_t index_3 = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff,
    
                                      14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
    
          // Expand Y values into 32-bit lanes for later arithmetic.
    
          // Note: "even" and "odd" describe the pixel position in the YUV422 packing,
    
          // not the Y component itself.
    
          //
    
          // In YUV422, pixels are stored as (Y0, U0, Y1, V0).
    
          // - The "even" vectors collect Y0, Y2, Y4, ... → these generate the
    
          //   even-positioned RGB outputs.
    
          // - The "odd" vectors collect Y1, Y3, Y5, ... → these generate the
    
          //   odd-positioned RGB outputs.
    
      768
          int32x4_t y_even_lo_lo =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_0));
    
      768
          int32x4_t y_even_lo_hi =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_1));
    
      768
          int32x4_t y_even_hi_lo =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_2));
    
      768
          int32x4_t y_even_hi_hi =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_even_lanes_m16, index_3));
    
      768
          int32x4_t y_odd_lo_lo =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_0));
    
      768
          int32x4_t y_odd_lo_hi =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_1));
    
      768
          int32x4_t y_odd_hi_lo =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_2));
    
      768
          int32x4_t y_odd_hi_hi =
    
      384
              vreinterpretq_s32_u8(vqtbl1q_u8(y_odd_lanes_m16, index_3));
    
          // Expand U and V into 32-bit lanes (shared chroma).
    
          // In YUV422, each U and V value is shared by a pair of pixels:
    
          //   (Y_even, U, Y_odd, V)
    
          // Therefore, the same U and V vectors are used when computing both
    
          // the "even" and "odd" RGB outputs.
    
      384
          int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_0));
    
      384
          int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_1));
    
      384
          int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_2));
    
      384
          int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_3));
    
      384
          int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_0));
    
      384
          int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_1));
    
      384
          int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_2));
    
      384
          int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_3));
    
          // Scale the Y (luma) values by the fixed coefficient kYWeight.
    
          // This produces the weighted luma contribution (Y') that forms the
    
          // base term for all R, G, and B channel calculations in the
    
          // YUV → RGB conversion.
    
      384
          y_even_lo_lo = vmulq_n_s32(y_even_lo_lo, kYWeight);
    
      384
          y_even_lo_hi = vmulq_n_s32(y_even_lo_hi, kYWeight);
    
      384
          y_even_hi_lo = vmulq_n_s32(y_even_hi_lo, kYWeight);
    
      384
          y_even_hi_hi = vmulq_n_s32(y_even_hi_hi, kYWeight);
    
      384
          y_odd_lo_lo = vmulq_n_s32(y_odd_lo_lo, kYWeight);
    
      384
          y_odd_lo_hi = vmulq_n_s32(y_odd_lo_hi, kYWeight);
    
      384
          y_odd_hi_lo = vmulq_n_s32(y_odd_hi_lo, kYWeight);
    
      384
          y_odd_hi_hi = vmulq_n_s32(y_odd_hi_hi, kYWeight);
    
          // Precompute constant base offsets for R, G, and B channels.
    
          // These include the rounding term (1 << (kWeightScale - 1)) and the
    
          // bias correction for centering U and V around 128.
    
          // This ensures that chroma values (U,V) are properly zero-based before
    
          // applying their respective weighting factors in the YUV → RGB formulas.
    
          // Folding -128 * weight into the base lets the raw (uncentred) U and V
    
          // lanes be multiplied directly below.
    
      768
          int32x4_t r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
    
      384
                                        128 * kUVWeights[kRVWeightIndex])};
    
      768
          int32x4_t g_base_{vdupq_n_s32(
    
      384
              (1 << (kWeightScale - 1)) -
    
      384
              128 * (kUVWeights[kGUWeightIndex] + kUVWeights[kGVWeightIndex]))};
    
      768
          int32x4_t b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
    
      384
                                        128 * kUVWeights[kBUWeightIndex])};
    
          // --- Compute the Red channel ---
    
          // Formula: R = Y + (kRV * V) + bias
    
          // - Start with r_base_ (rounding + bias correction for V centered at 128).
    
          // - Multiply V by kUVWeights[kRVWeightIndex] and add the result to r_base_.
    
          // - Reuse the same V contribution for both even and odd pixels, since
    
          //   chroma is shared in YUV422.
    
          // - Finally, add the weighted Y values (even and odd) to produce
    
          //   the full R channel before normalization and packing to 8 bits.
    
      768
          int32x4_t r_even_lo_lo =
    
      384
              vmlaq_n_s32(r_base_, v_lo_lo, kUVWeights[kRVWeightIndex]);
    
      768
          int32x4_t r_even_lo_hi =
    
      384
              vmlaq_n_s32(r_base_, v_lo_hi, kUVWeights[kRVWeightIndex]);
    
      768
          int32x4_t r_even_hi_lo =
    
      384
              vmlaq_n_s32(r_base_, v_hi_lo, kUVWeights[kRVWeightIndex]);
    
      768
          int32x4_t r_even_hi_hi =
    
      384
              vmlaq_n_s32(r_base_, v_hi_hi, kUVWeights[kRVWeightIndex]);
    
          // Odd pixels reuse the same chroma base, so compute them before the even
    
          // registers are updated with their Y contribution.
    
      384
          int32x4_t r_odd_lo_lo = vaddq_s32(r_even_lo_lo, y_odd_lo_lo);
    
      384
          int32x4_t r_odd_lo_hi = vaddq_s32(r_even_lo_hi, y_odd_lo_hi);
    
      384
          int32x4_t r_odd_hi_lo = vaddq_s32(r_even_hi_lo, y_odd_hi_lo);
    
      384
          int32x4_t r_odd_hi_hi = vaddq_s32(r_even_hi_hi, y_odd_hi_hi);
    
      384
          r_even_lo_lo = vaddq_s32(r_even_lo_lo, y_even_lo_lo);
    
      384
          r_even_lo_hi = vaddq_s32(r_even_lo_hi, y_even_lo_hi);
    
      384
          r_even_hi_lo = vaddq_s32(r_even_hi_lo, y_even_hi_lo);
    
      384
          r_even_hi_hi = vaddq_s32(r_even_hi_hi, y_even_hi_hi);
    
          // Re-interleave and pack the Red channel to u8.
    
          // We computed R in 4-lane chunks split by pixel parity, for example:
    
          //   r_even_lo_lo (even pixels 0..3),   r_even_lo_hi (even 4..7)
    
          //   r_odd_lo_lo  (odd  pixels 0..3),   r_odd_lo_hi  (odd  4..7)
    
          // normalize_and_pack(...) saturates → shifts → narrows s32→u8 *and*
    
          // interleaves even/odd so the output is in raster order:
    
          //   [R0, R1, R2, R3, ...] (i.e., even0, odd0, even1, odd1, ...).
    
          // r0 packs the first 16 R samples; r1 packs the next 16, which come from
    
          // the *_hi_* groups.
    
      768
          uint8x16_t r0 = normalize_and_pack(r_even_lo_lo, r_even_lo_hi, r_odd_lo_lo,
    
      384
                                             r_odd_lo_hi);
    
      768
          uint8x16_t r1 = normalize_and_pack(r_even_hi_lo, r_even_hi_hi, r_odd_hi_lo,
    
      384
                                             r_odd_hi_hi);
    
          // --- Compute the Green channel ---
    
          // Formula: G = Y + (kGU * U + kGV * V) + bias, reusing the shared U and V
    
          // samples for both pixels in each YUV422 pair. normalize_and_pack(...)
    
          // narrows back to u8 and interleaves even/odd results into raster order.
    
      768
          int32x4_t g_even_lo_lo =
    
      384
              vmlaq_n_s32(g_base_, u_lo_lo, kUVWeights[kGUWeightIndex]);
    
      768
          int32x4_t g_even_lo_hi =
    
      384
              vmlaq_n_s32(g_base_, u_lo_hi, kUVWeights[kGUWeightIndex]);
    
      768
          int32x4_t g_even_hi_lo =
    
      384
              vmlaq_n_s32(g_base_, u_hi_lo, kUVWeights[kGUWeightIndex]);
    
      768
          int32x4_t g_even_hi_hi =
    
      384
              vmlaq_n_s32(g_base_, u_hi_hi, kUVWeights[kGUWeightIndex]);
    
      384
          g_even_lo_lo =
    
      384
              vmlaq_n_s32(g_even_lo_lo, v_lo_lo, kUVWeights[kGVWeightIndex]);
    
      384
          g_even_lo_hi =
    
      384
              vmlaq_n_s32(g_even_lo_hi, v_lo_hi, kUVWeights[kGVWeightIndex]);
    
      384
          g_even_hi_lo =
    
      384
              vmlaq_n_s32(g_even_hi_lo, v_hi_lo, kUVWeights[kGVWeightIndex]);
    
      384
          g_even_hi_hi =
    
      384
              vmlaq_n_s32(g_even_hi_hi, v_hi_hi, kUVWeights[kGVWeightIndex]);
    
          // Same rationale as for Red: capture odd pixels before the even lanes add
    
          // Y.
    
      384
          int32x4_t g_odd_lo_lo = vaddq_s32(g_even_lo_lo, y_odd_lo_lo);
    
      384
          int32x4_t g_odd_lo_hi = vaddq_s32(g_even_lo_hi, y_odd_lo_hi);
    
      384
          int32x4_t g_odd_hi_lo = vaddq_s32(g_even_hi_lo, y_odd_hi_lo);
    
      384
          int32x4_t g_odd_hi_hi = vaddq_s32(g_even_hi_hi, y_odd_hi_hi);
    
      384
          g_even_lo_lo = vaddq_s32(g_even_lo_lo, y_even_lo_lo);
    
      384
          g_even_lo_hi = vaddq_s32(g_even_lo_hi, y_even_lo_hi);
    
      384
          g_even_hi_lo = vaddq_s32(g_even_hi_lo, y_even_hi_lo);
    
      384
          g_even_hi_hi = vaddq_s32(g_even_hi_hi, y_even_hi_hi);
    
      768
          uint8x16_t g0 = normalize_and_pack(g_even_lo_lo, g_even_lo_hi, g_odd_lo_lo,
    
      384
                                             g_odd_lo_hi);
    
      768
          uint8x16_t g1 = normalize_and_pack(g_even_hi_lo, g_even_hi_hi, g_odd_hi_lo,
    
      384
                                             g_odd_hi_hi);
    
          // --- Compute the Blue channel ---
    
          // Formula: B = Y + (kBU * U) + bias, sharing the same U samples across the
    
          // even/odd pair before normalize_and_pack(...) interleaves the outputs.
    
      768
          int32x4_t b_even_lo_lo =
    
      384
              vmlaq_n_s32(b_base_, u_lo_lo, kUVWeights[kBUWeightIndex]);
    
      768
          int32x4_t b_even_lo_hi =
    
      384
              vmlaq_n_s32(b_base_, u_lo_hi, kUVWeights[kBUWeightIndex]);
    
      768
          int32x4_t b_even_hi_lo =
    
      384
              vmlaq_n_s32(b_base_, u_hi_lo, kUVWeights[kBUWeightIndex]);
    
      768
          int32x4_t b_even_hi_hi =
    
      384
              vmlaq_n_s32(b_base_, u_hi_hi, kUVWeights[kBUWeightIndex]);
    
          // Blue follows the same ordering so odd lanes are finalized before evens.
    
      384
          int32x4_t b_odd_lo_lo = vaddq_s32(b_even_lo_lo, y_odd_lo_lo);
    
      384
          int32x4_t b_odd_lo_hi = vaddq_s32(b_even_lo_hi, y_odd_lo_hi);
    
      384
          int32x4_t b_odd_hi_lo = vaddq_s32(b_even_hi_lo, y_odd_hi_lo);
    
      384
          int32x4_t b_odd_hi_hi = vaddq_s32(b_even_hi_hi, y_odd_hi_hi);
    
      384
          b_even_lo_lo = vaddq_s32(b_even_lo_lo, y_even_lo_lo);
    
      384
          b_even_lo_hi = vaddq_s32(b_even_lo_hi, y_even_lo_hi);
    
      384
          b_even_hi_lo = vaddq_s32(b_even_hi_lo, y_even_hi_lo);
    
      384
          b_even_hi_hi = vaddq_s32(b_even_hi_hi, y_even_hi_hi);
    
      768
          uint8x16_t b0 = normalize_and_pack(b_even_lo_lo, b_even_lo_hi, b_odd_lo_lo,
    
      384
                                             b_odd_lo_hi);
    
      768
          uint8x16_t b1 = normalize_and_pack(b_even_hi_lo, b_even_hi_hi, b_odd_hi_lo,
    
      384
                                             b_odd_hi_hi);
    
          // Channel placement is driven by b_idx: blue goes to b_idx, red to
    
          // (2 - b_idx), green stays at 1, so the same code emits RGB(A) or BGR(A).
    
          if constexpr (dcn > 3) {
    
      192
            uint8x16x4_t rgba0, rgba1;
    
            // Red channel
    
      192
            rgba0.val[2 - b_idx] = r0;
    
      192
            rgba1.val[2 - b_idx] = r1;
    
            // Green channel
    
      192
            rgba0.val[1] = g0;
    
      192
            rgba1.val[1] = g1;
    
            // Blue channel
    
      192
            rgba0.val[b_idx] = b0;
    
      192
            rgba1.val[b_idx] = b1;
    
            // Alpha channel
    
      192
            rgba0.val[3] = vdupq_n_u8(0xFF);
    
      192
            rgba1.val[3] = vdupq_n_u8(0xFF);
    
            // Store the interleaved 4-channel pixels to memory.
    
      192
            vst4q_u8(rgbx0, rgba0);
    
      192
            vst4q_u8(rgbx1, rgba1);
    
      192
          } else {
    
            // Three-channel output; the rgba* names are kept although no alpha is
    
            // stored in this branch.
    
      192
            uint8x16x3_t rgba0, rgba1;
    
            // Red channel
    
      192
            rgba0.val[2 - b_idx] = r0;
    
      192
            rgba1.val[2 - b_idx] = r1;
    
            // Green channel
    
      192
            rgba0.val[1] = g0;
    
      192
            rgba1.val[1] = g1;
    
            // Blue channel
    
      192
            rgba0.val[b_idx] = b0;
    
      192
            rgba1.val[b_idx] = b1;
    
            // Store the interleaved 3-channel pixels to memory.
    
      192
            vst3q_u8(rgbx0, rgba0);
    
      192
            vst3q_u8(rgbx1, rgba1);
    
      192
          }
    
      384
        }
    
        // Scalar counterpart of the vector routine: converts one YUV422 pair.
    
        // y_rows holds the two luma samples, u_m128 / v_m128 the shared chroma
    
        // already centred by subtracting 128, and rgbx_rows the two destination
    
        // pixel pointers. Each *_sub_y term is the chroma contribution plus the
    
        // rounding constant, so only the scaled luma has to be added per pixel.
    
        KLEIDICV_FORCE_INLINE
    
      35516
        static void yuv422_to_rgb(const uint8_t y_rows[2], int32_t u_m128,
    
                                  int32_t v_m128, uint8_t* rgbx_rows[2]) {
    
      71032
          int32_t r_sub_y =
    
      35516
              kUVWeights[kRVWeightIndex] * v_m128 + (1 << (kWeightScale - 1));
    
      106548
          int32_t g_sub_y = kUVWeights[kGUWeightIndex] * u_m128 +
    
      71032
                            kUVWeights[kGVWeightIndex] * v_m128 +
    
                            (1 << (kWeightScale - 1));
    
      71032
          int32_t b_sub_y =
    
      35516
              kUVWeights[kBUWeightIndex] * u_m128 + (1 << (kWeightScale - 1));
    
        24/24✓ Branch 0 taken 3550 times.
✓ Branch 1 taken 7100 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 3550 times.
✓ Branch 7 taken 7100 times.
✓ Branch 8 taken 3550 times.
✓ Branch 9 taken 7100 times.
✓ Branch 10 taken 3550 times.
✓ Branch 11 taken 7100 times.
✓ Branch 12 taken 3550 times.
✓ Branch 13 taken 7100 times.
✓ Branch 14 taken 3550 times.
✓ Branch 15 taken 7100 times.
✓ Branch 16 taken 3550 times.
✓ Branch 17 taken 7100 times.
✓ Branch 18 taken 3550 times.
✓ Branch 19 taken 7100 times.
✓ Branch 20 taken 3550 times.
✓ Branch 21 taken 7100 times.
✓ Branch 22 taken 3550 times.
✓ Branch 23 taken 7100 times.

      106548
          for (size_t selector = 0; selector < 2; ++selector) {
    
            // Clamp (Y - 16) at zero, mirroring the saturating subtract used by the
    
            // vector routine, then apply the luma weight.
    
            // NOTE(review): std::max is declared in <algorithm>; only <utility> is
    
            // included directly in this file, so this presumably relies on a
    
            // transitive include — confirm.
    
      71032
            int32_t y = kYWeight * std::max(y_rows[selector] - 16, 0);
    
      71032
            int32_t r = y + r_sub_y;
    
      71032
            int32_t g = y + g_sub_y;
    
      71032
            int32_t b = y + b_sub_y;
    
            // Drop the fixed-point fraction; rounding was already added above.
    
      71032
            r >>= kWeightScale;
    
      71032
            g >>= kWeightScale;
    
      71032
            b >>= kWeightScale;
    
      71032
            rgbx_rows[selector][2 - b_idx] = saturating_cast<int32_t, uint8_t>(r);
    
      71032
            rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g);
    
      71032
            rgbx_rows[selector][b_idx] = saturating_cast<int32_t, uint8_t>(b);
    
            if constexpr (dcn > 3) {
    
      42600
              rgbx_rows[selector][3] = 0xFF;
    
            }
    
            // NOTE(review): this advance has no visible effect in this file — each
    
            // pointer is written once per call and RemainingFunctor rebuilds
    
            // rgbx_rows on every iteration.
    
      71032
            rgbx_rows[selector] += dcn;
    
      71032
          }
    
      35516
        }
    
      };
    
      // Entry point for 8-bit YUV422 → RGB/BGR(A) conversion.
    
      // Validates the arguments, then dispatches on color_format to the matching
    
      // YUV422ToRGBxOrBGRx instantiation and returns its result.
    
      // Returns KLEIDICV_ERROR_NOT_IMPLEMENTED when width is below 2 or odd, and
    
      // for any color_format not listed in the switch. The CHECK_* macros are
    
      // defined outside this file; they presumably return an error code early
    
      // when a check fails — confirm against their definitions.
    
      KLEIDICV_TARGET_FN_ATTRS
    
      562
      kleidicv_error_t yuv422_to_rgb_u8(const uint8_t* src, size_t src_stride,
    
                                        uint8_t* dst, size_t dst_stride, size_t width,
    
                                        size_t height,
    
                                        kleidicv_color_conversion_t color_format) {
    
        4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 561 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 561 times.

      562
        CHECK_POINTER_AND_STRIDE(src, src_stride, height);
    
        4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 560 times.

      561
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    
        6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 557 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 555 times.
✓ Branch 4 taken 5 times.
✓ Branch 5 taken 555 times.

      560
        CHECK_IMAGE_SIZE(width, height);
    
        // YUV422 packs pixels in pairs: (Y0, U, Y1, V).
    
        // Therefore, the image width must be at least 2 and always even.
    
        4/4✓ Branch 0 taken 553 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 543 times.

      555
        if (width < 2 || (width % 2) != 0) {
    
      12
          return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
        }
    
        // Template arguments are <b_idx, u_chroma_idx, y_idx, dcn>:
    
        //   b_idx 0 → BGR order, 2 → RGB order; dcn 3 → no alpha, 4 → with alpha;
    
        //   (u_chroma_idx, y_idx) is (1, 0) for YUYV, (0, 1) for UYVY and (3, 0)
    
        //   for YVYU.
    
        // NOTE(review): every case returns, so the `break;` statements and the
    
        // return after the switch are unreachable.
    
        13/13✓ Branch 0 taken 52 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 53 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 1 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 52 times.
✓ Branch 11 taken 52 times.
✓ Branch 12 taken 52 times.

      543
        switch (color_format) {
    
          case KLEIDICV_YUYV_TO_BGR:
    
      53
            return YUV422ToRGBxOrBGRx<0, 1, 0, 3>::yuv2rgbx_operation(
    
      53
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_UYVY_TO_BGR:
    
      1
            return YUV422ToRGBxOrBGRx<0, 0, 1, 3>::yuv2rgbx_operation(
    
      1
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YVYU_TO_BGR:
    
      1
            return YUV422ToRGBxOrBGRx<0, 3, 0, 3>::yuv2rgbx_operation(
    
      1
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YUYV_TO_RGB:
    
      52
            return YUV422ToRGBxOrBGRx<2, 1, 0, 3>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_UYVY_TO_RGB:
    
      52
            return YUV422ToRGBxOrBGRx<2, 0, 1, 3>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YVYU_TO_RGB:
    
      52
            return YUV422ToRGBxOrBGRx<2, 3, 0, 3>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YUYV_TO_BGRA:
    
      52
            return YUV422ToRGBxOrBGRx<0, 1, 0, 4>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_UYVY_TO_BGRA:
    
      52
            return YUV422ToRGBxOrBGRx<0, 0, 1, 4>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YVYU_TO_BGRA:
    
      52
            return YUV422ToRGBxOrBGRx<0, 3, 0, 4>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YUYV_TO_RGBA:
    
      52
            return YUV422ToRGBxOrBGRx<2, 1, 0, 4>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_UYVY_TO_RGBA:
    
      52
            return YUV422ToRGBxOrBGRx<2, 0, 1, 4>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          case KLEIDICV_YVYU_TO_RGBA:
    
      52
            return YUV422ToRGBxOrBGRx<2, 3, 0, 4>::yuv2rgbx_operation(
    
      52
                src, src_stride, dst, dst_stride, width, height);
    
            break;
    
          default:
    
      20
            return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
            break;
    
        }
    
        return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
      562
      }
    
      }  // namespace kleidicv::neon