KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	106	106	100.0%
Functions:	28	28	100.0%
Branches:	96	96	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <utility>
    
      #include "kleidicv/conversions/yuv_420_to_rgb.h"
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/neon.h"
    
      #include "yuv420_to_rgb_neon.h"
    
      namespace kleidicv::neon {
    
      template <bool BGR, bool kAlpha>
    
      class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha>,
    
                                     public UnrollOnce,
    
                                     public TryToAvoidTailLoop {
    
       public:
    
        using VecTraits = neon::VecTraits<uint8_t>;
    
        using ScalarType = VecTraits::ScalarType;
    
        using VectorType = VecTraits::VectorType;
    
        using YUV420XToRGBxOrBGRx<BGR, kAlpha>::de_interleave_indices_;
    
        using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb;
    
        using YUV420XToRGBxOrBGRx<BGR, kAlpha>::v_first_;
    
      380
        explicit YUVpToRGBxOrBGRx(bool is_yv12)
    
      380
            : YUV420XToRGBxOrBGRx<BGR, kAlpha>(is_yv12) {}
    
      288
        void vector_path(VectorType &y0, VectorType &y1, VectorType &y2,
    
                         VectorType &y3, VectorType &u, VectorType &v,
    
                         ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
    
          // Indices to extract every 4 bytes into 4x 32-bit slots (0xff = ignore)
    
          // These are needed to expand each group of 4 bytes into a full 32-bit lane
    
      288
          uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff,
    
                                    2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff};
    
      288
          uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff,
    
                                    6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff};
    
      288
          uint8x16_t index_hi_lo = {8,  0xff, 0xff, 0xff, 9,  0xff, 0xff, 0xff,
    
                                    10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff};
    
      288
          uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff,
    
                                    14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
    
          // Expand each 8-bit channel into 32-bit vectors using table lookup and
    
          // reinterpret
    
      288
          int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo));
    
      288
          int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi));
    
      288
          int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo));
    
      288
          int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi));
    
      288
          int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo));
    
      288
          int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi));
    
      288
          int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo));
    
      288
          int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi));
    
      288
          constexpr size_t step = kAlpha ? 4 * 16 : 3 * 16;
    
      576
          yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0,
    
      288
                         rgbx_row_1);
    
      576
          yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi,
    
      288
                         rgbx_row_0 + step, rgbx_row_1 + step);
    
      288
        }
    
        // Processes inputs which are not long enough to fit a vector.
    
      5184
        void scalar_path(size_t length, const ScalarType *y_row_0,
    
                         const ScalarType *y_row_1, const ScalarType *u_row,
    
                         const ScalarType *v_row, ScalarType *rgbx_row_0,
    
                         ScalarType *rgbx_row_1) {
    
      5184
          const uint8_t *y_rows[2] = {y_row_0, y_row_1};
    
      5184
          uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1};
    
      5184
          int32_t u_m128 = 0, v_m128 = 0;
    
        8/8✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 4646 times.
✓ Branch 2 taken 1296 times.
✓ Branch 3 taken 4646 times.
✓ Branch 4 taken 1296 times.
✓ Branch 5 taken 4646 times.
✓ Branch 6 taken 1296 times.
✓ Branch 7 taken 4646 times.

      23768
          for (size_t index = 0; index < length; ++index) {
    
      18584
            disable_loop_vectorization();
    
            // There is one {U, V} pair for 4 Y values.
    
        8/8✓ Branch 0 taken 2314 times.
✓ Branch 1 taken 2332 times.
✓ Branch 2 taken 2314 times.
✓ Branch 3 taken 2332 times.
✓ Branch 4 taken 2314 times.
✓ Branch 5 taken 2332 times.
✓ Branch 6 taken 2314 times.
✓ Branch 7 taken 2332 times.

      18584
            if ((index % 2) == 0) {
    
      9328
              u_m128 = u_row[0] - 128;
    
      9328
              v_m128 = v_row[0] - 128;
    
      9328
              u_row += 1;
    
      9328
              v_row += 1;
    
        8/8✓ Branch 0 taken 2062 times.
✓ Branch 1 taken 270 times.
✓ Branch 2 taken 2062 times.
✓ Branch 3 taken 270 times.
✓ Branch 4 taken 2062 times.
✓ Branch 5 taken 270 times.
✓ Branch 6 taken 2062 times.
✓ Branch 7 taken 270 times.

      9328
              if (v_first_) {
    
      1080
                std::swap(u_m128, v_m128);
    
      1080
              }
    
      9328
            }
    
      18584
            yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows);
    
      18584
          }
    
      5184
        }
    
      };  // end of class YUVpToRGBxOrBGRx<bool, bool>
    
      using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>;
    
      using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>;
    
      using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>;
    
      using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>;
    
      template <typename OperationType, typename ScalarType>
    
      380
      kleidicv_error_t yuv2rgbx_operation(OperationType &operation,
    
                                          const ScalarType *src, size_t src_stride,
    
                                          ScalarType *dst, size_t dst_stride,
    
                                          size_t width, size_t height, size_t begin,
    
                                          size_t end) {
    
        16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 91 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 91 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 91 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 91 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 91 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 91 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 91 times.

      380
        CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2);
    
        16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 87 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 87 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 87 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 87 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 87 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 87 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 87 times.

      364
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    
        24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 83 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 83 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 79 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 79 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 83 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 79 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 79 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 83 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 79 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 79 times.

      348
        CHECK_IMAGE_SIZE(width, height);
    
        // Pointer to the start of the U plane.
    
        // Since `src` points to a planar YUV buffer, the Y plane comes first,
    
        // occupying `src_stride * height` bytes.
    
      316
        const ScalarType *u = src + src_stride * height;
    
        // Pointer to the start of the V plane.
    
        // The V plane follows the U plane. Both U and V planes are
    
        // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and
    
        // are often stored in a single contiguous chroma region in memory. Depending
    
        // on image height and stride, the starting offset of V may require adjustment
    
        // to maintain correct alignment. In particular, when the image height is not
    
        // divisible evenly by 4, the chroma rows may not align perfectly, so a
    
        // fractional offset (in rows) is applied to calculate the V plane position.
    
        // The formula used here accounts for this by adjusting based on row parity,
    
        // assuming consistent memory layout across the Y, U, and V planes.
    
      632
        const ScalarType *v =
    
      316
            u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2);
    
        // These indices control how U and V row strides are selected across the image
    
        // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to
    
        // two luma (Y) rows. However, when the image height is not divisible by 4,
    
        // the mapping between chroma and luma rows becomes asymmetric. Specifically,
    
        // when `height % 4 == 2`, the start of the V plane is offset by one chroma
    
        // row relative to U.
    
        //
    
        // This results in U and V rows being interleaved with a phase difference,
    
        // which must be accounted for during row-wise traversal. To handle this,
    
        // `u_index` and `v_index` are used to alternate the stride selection
    
        // independently for U and V across the loop.
    
        //
    
        // This mechanism ensures that memory access patterns remain correct,
    
        // especially in layouts where U and V share a contiguous buffer with
    
        // alternating strides. Offsetting `v_index` allows the traversal logic to
    
        // maintain correct alignment and prevents misaligned or incorrect reads from
    
        // the chroma buffer.
    
      316
        size_t u_index = 0;
    
      316
        size_t v_index = height % 4 == 2 ? 1 : 0;
    
        // Compute the actual row range in the Y plane (full resolution).
    
        // Since each UV row maps to 2 Y rows, we double the begin/end indices.
    
      316
        size_t row_begin = begin * 2;
    
      316
        size_t row_end = std::min<size_t>(height, end * 2);
    
      316
        size_t row_uv = begin;
    
        // UV stepping pattern: first half of row, then padded second half.
    
        // Needed to match row strides between chroma and luma components.
    
      316
        size_t uv_strides[2] = {width / 2, src_stride - width / 2};
    
        // Calculate starting pointers for Y, U, and V planes at the given stripe
    
        // start.
    
      316
        const ScalarType *y0 = src + row_begin * src_stride;
    
      316
        u = u + row_uv * src_stride / 2;
    
      316
        v = v + row_uv * src_stride / 2;
    
      316
        size_t dcn = operation.output_channels();
    
        8/8✓ Branch 0 taken 1336 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1336 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 1336 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 1336 times.
✓ Branch 7 taken 79 times.

      5660
        for (size_t h = row_begin; h < row_end; h += 2) {
    
      5344
          ScalarType *row0 = dst + dst_stride * h;
    
      5344
          ScalarType *row1 = dst + dst_stride * (h + 1);
    
      5344
          const ScalarType *y1 = y0 + src_stride;
    
          // Guard for odd-height images.
    
          // If the last row in the stripe is unpaired (odd number of rows),
    
          // reuse the previous row pointers to avoid out-of-bounds access.
    
        8/8✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 40 times.
✓ Branch 2 taken 1296 times.
✓ Branch 3 taken 40 times.
✓ Branch 4 taken 1296 times.
✓ Branch 5 taken 40 times.
✓ Branch 6 taken 1296 times.
✓ Branch 7 taken 40 times.

      5344
          if (KLEIDICV_UNLIKELY(h == (row_end - 1))) {
    
      160
            row1 = row0;
    
      160
            y1 = y0;
    
      160
          }
    
      5344
          LoopUnroll2 loop{width, kVectorLength};
    
      5632
          loop.unroll_twice([&](size_t index) {
    
      288
            uint8x16_t u_vec = vld1q_u8(u + index / 2);
    
      288
            uint8x16_t v_vec = vld1q_u8(v + index / 2);
    
      288
            uint8x16_t y0_vec = vld1q_u8(y0 + index);
    
      288
            uint8x16_t y1_vec = vld1q_u8(y1 + index);
    
      288
            uint8x16_t y2_vec = vld1q_u8(y0 + index + kVectorLength);
    
      288
            uint8x16_t y3_vec = vld1q_u8(y1 + index + kVectorLength);
    
      576
            operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec,
    
      288
                                  &row0[index * dcn], &row1[index * dcn]);
    
      288
          });
    
      10528
          loop.remaining([&](size_t index, size_t length) {
    
      10368
            operation.scalar_path(length - index, y0 + index, y1 + index,
    
      5184
                                  u + index / 2, v + index / 2, &row0[index * dcn],
    
      5184
                                  &row1[index * dcn]);
    
      5184
          });
    
      5344
          y0 += src_stride * 2;
    
      5344
          u += uv_strides[(u_index++) & 1];
    
      5344
          v += uv_strides[(v_index++) & 1];
    
      5344
        }
    
      316
        return KLEIDICV_OK;
    
      380
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      95
      kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride,
    
                                              uint8_t *dst, size_t dst_stride,
    
                                              size_t width, size_t height,
    
                                              bool is_yv12, size_t begin,
    
                                              size_t end) {
    
      95
        YUVpToRGB operation{is_yv12};
    
      285
        return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
    
      95
                                  height, begin, end);
    
      95
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      95
      kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride,
    
                                               uint8_t *dst, size_t dst_stride,
    
                                               size_t width, size_t height,
    
                                               bool is_yv12, size_t begin,
    
                                               size_t end) {
    
      95
        YUVpToRGBA operation{is_yv12};
    
      285
        return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
    
      95
                                  height, begin, end);
    
      95
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      95
      kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride,
    
                                              uint8_t *dst, size_t dst_stride,
    
                                              size_t width, size_t height,
    
                                              bool is_yv12, size_t begin,
    
                                              size_t end) {
    
      95
        YUVpToBGR operation{is_yv12};
    
      285
        return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
    
      95
                                  height, begin, end);
    
      95
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      95
      kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride,
    
                                               uint8_t *dst, size_t dst_stride,
    
                                               size_t width, size_t height,
    
                                               bool is_yv12, size_t begin,
    
                                               size_t end) {
    
      95
        YUVpToBGRA operation{is_yv12};
    
      285
        return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
    
      95
                                  height, begin, end);
    
      95
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include <utility>
6
7			#include "kleidicv/conversions/yuv_420_to_rgb.h"
8			#include "kleidicv/kleidicv.h"
9			#include "kleidicv/neon.h"
10			#include "yuv420_to_rgb_neon.h"
11
12			namespace kleidicv::neon {
13			template <bool BGR, bool kAlpha>
14			class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx<BGR, kAlpha>,
15			public UnrollOnce,
16			public TryToAvoidTailLoop {
17			public:
18			using VecTraits = neon::VecTraits<uint8_t>;
19			using ScalarType = VecTraits::ScalarType;
20			using VectorType = VecTraits::VectorType;
21			using YUV420XToRGBxOrBGRx<BGR, kAlpha>::de_interleave_indices_;
22			using YUV420XToRGBxOrBGRx<BGR, kAlpha>::yuv420x_to_rgb;
23			using YUV420XToRGBxOrBGRx<BGR, kAlpha>::v_first_;
24
25		380	explicit YUVpToRGBxOrBGRx(bool is_yv12)
26		380	: YUV420XToRGBxOrBGRx<BGR, kAlpha>(is_yv12) {}
27
28		288	void vector_path(VectorType &y0, VectorType &y1, VectorType &y2,
29			VectorType &y3, VectorType &u, VectorType &v,
30			ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
31			// Indices to extract every 4 bytes into 4x 32-bit slots (0xff = ignore)
32			// These are needed to expand each group of 4 bytes into a full 32-bit lane
33		288	uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff,
34			2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff};
35
36		288	uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff,
37			6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff};
38
39		288	uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff,
40			10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff};
41
42		288	uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff,
43			14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
44
45			// Expand each 8-bit channel into 32-bit vectors using table lookup and
46			// reinterpret
47		288	int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo));
48		288	int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi));
49		288	int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo));
50		288	int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi));
51
52		288	int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo));
53		288	int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi));
54		288	int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo));
55		288	int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi));
56
57		288	constexpr size_t step = kAlpha ? 4 * 16 : 3 * 16;
58
59		576	yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0,
60		288	rgbx_row_1);
61
62		576	yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi,
63		288	rgbx_row_0 + step, rgbx_row_1 + step);
64		288	}
65
66			// Processes inputs which are not long enough to fit a vector.
67		5184	void scalar_path(size_t length, const ScalarType *y_row_0,
68			const ScalarType *y_row_1, const ScalarType *u_row,
69			const ScalarType *v_row, ScalarType *rgbx_row_0,
70			ScalarType *rgbx_row_1) {
71		5184	const uint8_t *y_rows[2] = {y_row_0, y_row_1};
72		5184	uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1};
73
74		5184	int32_t u_m128 = 0, v_m128 = 0;
75	8/8 ✓ Branch 0 taken 1296 times. ✓ Branch 1 taken 4646 times. ✓ Branch 2 taken 1296 times. ✓ Branch 3 taken 4646 times. ✓ Branch 4 taken 1296 times. ✓ Branch 5 taken 4646 times. ✓ Branch 6 taken 1296 times. ✓ Branch 7 taken 4646 times.	23768	for (size_t index = 0; index < length; ++index) {
76		18584	disable_loop_vectorization();
77
78			// There is one {U, V} pair for 4 Y values.
79	8/8 ✓ Branch 0 taken 2314 times. ✓ Branch 1 taken 2332 times. ✓ Branch 2 taken 2314 times. ✓ Branch 3 taken 2332 times. ✓ Branch 4 taken 2314 times. ✓ Branch 5 taken 2332 times. ✓ Branch 6 taken 2314 times. ✓ Branch 7 taken 2332 times.	18584	if ((index % 2) == 0) {
80		9328	u_m128 = u_row[0] - 128;
81		9328	v_m128 = v_row[0] - 128;
82		9328	u_row += 1;
83		9328	v_row += 1;
84	8/8 ✓ Branch 0 taken 2062 times. ✓ Branch 1 taken 270 times. ✓ Branch 2 taken 2062 times. ✓ Branch 3 taken 270 times. ✓ Branch 4 taken 2062 times. ✓ Branch 5 taken 270 times. ✓ Branch 6 taken 2062 times. ✓ Branch 7 taken 270 times.	9328	if (v_first_) {
85		1080	std::swap(u_m128, v_m128);
86		1080	}
87		9328	}
88
89		18584	yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows);
90		18584	}
91		5184	}
92			}; // end of class YUVpToRGBxOrBGRx<bool, bool>
93
94			using YUVpToRGB = YUVpToRGBxOrBGRx<false, false>;
95			using YUVpToRGBA = YUVpToRGBxOrBGRx<false, true>;
96			using YUVpToBGR = YUVpToRGBxOrBGRx<true, false>;
97			using YUVpToBGRA = YUVpToRGBxOrBGRx<true, true>;
98
99			template <typename OperationType, typename ScalarType>
100		380	kleidicv_error_t yuv2rgbx_operation(OperationType &operation,
101			const ScalarType *src, size_t src_stride,
102			ScalarType *dst, size_t dst_stride,
103			size_t width, size_t height, size_t begin,
104			size_t end) {
105	16/16 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 91 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 91 times. ✓ Branch 4 taken 4 times. ✓ Branch 5 taken 91 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 91 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 91 times. ✓ Branch 10 taken 4 times. ✓ Branch 11 taken 91 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 91 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 91 times.	380	CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2);
106	16/16 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 87 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 87 times. ✓ Branch 4 taken 4 times. ✓ Branch 5 taken 87 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 87 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 87 times. ✓ Branch 10 taken 4 times. ✓ Branch 11 taken 87 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 87 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 87 times.	364	CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
107	24/24 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 83 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 79 times. ✓ Branch 4 taken 8 times. ✓ Branch 5 taken 79 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 83 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 79 times. ✓ Branch 10 taken 8 times. ✓ Branch 11 taken 79 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 83 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 79 times. ✓ Branch 16 taken 8 times. ✓ Branch 17 taken 79 times. ✓ Branch 18 taken 4 times. ✓ Branch 19 taken 83 times. ✓ Branch 20 taken 4 times. ✓ Branch 21 taken 79 times. ✓ Branch 22 taken 8 times. ✓ Branch 23 taken 79 times.	348	CHECK_IMAGE_SIZE(width, height);
108
109			// Pointer to the start of the U plane.
110			// Since `src` points to a planar YUV buffer, the Y plane comes first,
111			// occupying `src_stride * height` bytes.
112		316	const ScalarType *u = src + src_stride * height;
113			// Pointer to the start of the V plane.
114			// The V plane follows the U plane. Both U and V planes are
115			// subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and
116			// are often stored in a single contiguous chroma region in memory. Depending
117			// on image height and stride, the starting offset of V may require adjustment
118			// to maintain correct alignment. In particular, when the image height is not
119			// divisible evenly by 4, the chroma rows may not align perfectly, so a
120			// fractional offset (in rows) is applied to calculate the V plane position.
121			// The formula used here accounts for this by adjusting based on row parity,
122			// assuming consistent memory layout across the Y, U, and V planes.
123		632	const ScalarType *v =
124		316	u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2);
125
126			// These indices control how U and V row strides are selected across the image
127			// height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to
128			// two luma (Y) rows. However, when the image height is not divisible by 4,
129			// the mapping between chroma and luma rows becomes asymmetric. Specifically,
130			// when `height % 4 == 2`, the start of the V plane is offset by one chroma
131			// row relative to U.
132			//
133			// This results in U and V rows being interleaved with a phase difference,
134			// which must be accounted for during row-wise traversal. To handle this,
135			// `u_index` and `v_index` are used to alternate the stride selection
136			// independently for U and V across the loop.
137			//
138			// This mechanism ensures that memory access patterns remain correct,
139			// especially in layouts where U and V share a contiguous buffer with
140			// alternating strides. Offsetting `v_index` allows the traversal logic to
141			// maintain correct alignment and prevents misaligned or incorrect reads from
142			// the chroma buffer.
143		316	size_t u_index = 0;
144		316	size_t v_index = height % 4 == 2 ? 1 : 0;
145
146			// Compute the actual row range in the Y plane (full resolution).
147			// Since each UV row maps to 2 Y rows, we double the begin/end indices.
148		316	size_t row_begin = begin * 2;
149		316	size_t row_end = std::min<size_t>(height, end * 2);
150		316	size_t row_uv = begin;
151
152			// UV stepping pattern: first half of row, then padded second half.
153			// Needed to match row strides between chroma and luma components.
154		316	size_t uv_strides[2] = {width / 2, src_stride - width / 2};
155
156			// Calculate starting pointers for Y, U, and V planes at the given stripe
157			// start.
158		316	const ScalarType *y0 = src + row_begin * src_stride;
159		316	u = u + row_uv * src_stride / 2;
160		316	v = v + row_uv * src_stride / 2;
161
162		316	size_t dcn = operation.output_channels();
163	8/8 ✓ Branch 0 taken 1336 times. ✓ Branch 1 taken 79 times. ✓ Branch 2 taken 1336 times. ✓ Branch 3 taken 79 times. ✓ Branch 4 taken 1336 times. ✓ Branch 5 taken 79 times. ✓ Branch 6 taken 1336 times. ✓ Branch 7 taken 79 times.	5660	for (size_t h = row_begin; h < row_end; h += 2) {
164		5344	ScalarType *row0 = dst + dst_stride * h;
165		5344	ScalarType *row1 = dst + dst_stride * (h + 1);
166		5344	const ScalarType *y1 = y0 + src_stride;
167
168			// Guard for odd-height images.
169			// If the last row in the stripe is unpaired (odd number of rows),
170			// reuse the previous row pointers to avoid out-of-bounds access.
171	8/8 ✓ Branch 0 taken 1296 times. ✓ Branch 1 taken 40 times. ✓ Branch 2 taken 1296 times. ✓ Branch 3 taken 40 times. ✓ Branch 4 taken 1296 times. ✓ Branch 5 taken 40 times. ✓ Branch 6 taken 1296 times. ✓ Branch 7 taken 40 times.	5344	if (KLEIDICV_UNLIKELY(h == (row_end - 1))) {
172		160	row1 = row0;
173		160	y1 = y0;
174		160	}
175
176		5344	LoopUnroll2 loop{width, kVectorLength};
177
178		5632	loop.unroll_twice([&](size_t index) {
179		288	uint8x16_t u_vec = vld1q_u8(u + index / 2);
180		288	uint8x16_t v_vec = vld1q_u8(v + index / 2);
181		288	uint8x16_t y0_vec = vld1q_u8(y0 + index);
182		288	uint8x16_t y1_vec = vld1q_u8(y1 + index);
183		288	uint8x16_t y2_vec = vld1q_u8(y0 + index + kVectorLength);
184		288	uint8x16_t y3_vec = vld1q_u8(y1 + index + kVectorLength);
185
186		576	operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec,
187		288	&row0[index * dcn], &row1[index * dcn]);
188		288	});
189
190		10528	loop.remaining([&](size_t index, size_t length) {
191		10368	operation.scalar_path(length - index, y0 + index, y1 + index,
192		5184	u + index / 2, v + index / 2, &row0[index * dcn],
193		5184	&row1[index * dcn]);
194		5184	});
195
196		5344	y0 += src_stride * 2;
197		5344	u += uv_strides[(u_index++) & 1];
198		5344	v += uv_strides[(v_index++) & 1];
199		5344	}
200
201		316	return KLEIDICV_OK;
202		380	}
203
204			KLEIDICV_TARGET_FN_ATTRS
205		95	kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride,
206			uint8_t *dst, size_t dst_stride,
207			size_t width, size_t height,
208			bool is_yv12, size_t begin,
209			size_t end) {
210		95	YUVpToRGB operation{is_yv12};
211		285	return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
212		95	height, begin, end);
213		95	}
214
215			KLEIDICV_TARGET_FN_ATTRS
216		95	kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride,
217			uint8_t *dst, size_t dst_stride,
218			size_t width, size_t height,
219			bool is_yv12, size_t begin,
220			size_t end) {
221		95	YUVpToRGBA operation{is_yv12};
222		285	return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
223		95	height, begin, end);
224		95	}
225
226			KLEIDICV_TARGET_FN_ATTRS
227		95	kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride,
228			uint8_t *dst, size_t dst_stride,
229			size_t width, size_t height,
230			bool is_yv12, size_t begin,
231			size_t end) {
232		95	YUVpToBGR operation{is_yv12};
233		285	return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
234		95	height, begin, end);
235		95	}
236
237			KLEIDICV_TARGET_FN_ATTRS
238		95	kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride,
239			uint8_t *dst, size_t dst_stride,
240			size_t width, size_t height,
241			bool is_yv12, size_t begin,
242			size_t end) {
243		95	YUVpToBGRA operation{is_yv12};
244		285	return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width,
245		95	height, begin, end);
246		95	}
247			} // namespace kleidicv::neon
248