KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/conversions/rgb_to_yuv444_neon.cpp
Date:	2026-03-05 15:57:40

	Exec	Total	Coverage
Lines:	122	122	100.0%
Functions:	21	21	100.0%
Branches:	61	61	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include "kleidicv/conversions/rgb_to_yuv.h"
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/neon.h"
    
      #include "rgb_to_yuv444_coefficients.h"
    
      namespace kleidicv::neon {
    
      template <bool BGR, bool kAlpha>
    
      class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
    
       public:
    
        using VecTraits = neon::VecTraits<uint8_t>;
    
        using ScalarType = VecTraits::ScalarType;
    
        using VectorType = VecTraits::VectorType;
    
        using RawSourceVectorType =
    
            typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
    
        explicit RGBToYUVAll() = default;
    
        // Returns the number of channels in the input image.
    
      260
        static constexpr size_t input_channels() {
    
      260
          return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
    
        }
    
        KLEIDICV_FORCE_INLINE
    
      1904
        void vector_path(const ScalarType *src, ScalarType *dst) {
    
      1904
          RawSourceVectorType vsrc;
    
      1904
          int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
    
          if constexpr (kAlpha) {
    
      952
            VecTraits::load(src, vsrc);
    
      952
            uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
    
      952
            uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
    
            if constexpr (BGR) {
    
      476
              b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
    
      476
              b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
    
      476
              r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
    
      476
              r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
    
            } else {
    
      476
              r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
    
      476
              r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
    
      476
              b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
    
      476
              b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
    
            }
    
      952
            uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
    
      952
            g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
    
      952
            uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
    
      952
            g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
    
      952
          } else {
    
            // Load deinterleaved
    
      952
            vsrc = vld3q_u8(src);
    
      952
            r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
    
      952
            r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
    
      952
            g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
    
      952
            g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
    
      952
            b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
    
      952
            b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
    
          }
    
          // Compute Y value in 32-bit precision
    
      1904
          int16x8_t y_l, y_h;
    
          {
    
      1904
            int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
    
      1904
            int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
    
      1904
            int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
    
      1904
            int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
    
      1904
            y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
    
      1904
            y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
    
      1904
            y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
    
      1904
            y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
    
      1904
            y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
    
      1904
            y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
    
      1904
            y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
    
      1904
            y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
    
      1904
            y_l = combine_scaled_s16(y_ll, y_lh);
    
      1904
            y_h = combine_scaled_s16(y_hl, y_hh);
    
      1904
          }
    
          // Using the 16-bit Y value, calculate U
    
      1904
          int16x8_t u_l, u_h;
    
          {
    
      1904
            int16x8_t uy_l = vqsubq(b_l, y_l);
    
      1904
            int16x8_t uy_h = vqsubq(b_h, y_h);
    
      1904
            int32x4_t u_ll = vdupq_n_s32(half_);
    
      1904
            int32x4_t u_lh = u_ll;
    
      1904
            int32x4_t u_hl = u_ll;
    
      1904
            int32x4_t u_hh = u_ll;
    
      1904
            u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
    
      1904
            u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
    
      1904
            u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
    
      1904
            u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
    
      1904
            u_l = combine_scaled_s16(u_ll, u_lh);
    
      1904
            u_h = combine_scaled_s16(u_hl, u_hh);
    
      1904
          }
    
          // Using the 16-bit Y value, calculate V
    
      1904
          int16x8_t v_l, v_h;
    
          {
    
      1904
            int16x8_t vy_l = vqsubq(r_l, y_l);
    
      1904
            int16x8_t vy_h = vqsubq(r_h, y_h);
    
      1904
            int32x4_t v_ll = vdupq_n_s32(half_);
    
      1904
            int32x4_t v_lh = v_ll;
    
      1904
            int32x4_t v_hl = v_ll;
    
      1904
            int32x4_t v_hh = v_ll;
    
      1904
            v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
    
      1904
            v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
    
      1904
            v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
    
      1904
            v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
    
      1904
            v_l = combine_scaled_s16(v_ll, v_lh);
    
      1904
            v_h = combine_scaled_s16(v_hl, v_hh);
    
      1904
          }
    
          // Narrow the results to 8 bits
    
      1904
          uint8x16x3_t yuv;
    
      1904
          yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
    
      1904
          yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
    
      1904
          yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
    
          // Store interleaved YUV pixels to memory.
    
      1904
          vst3q_u8(dst, yuv);
    
      1904
        }
    
      492
        void scalar_path(const ScalarType *src, ScalarType *dst) {
    
      984
          int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
    
      492
                      src[b_index_] * kBYWeight;
    
      492
          y = rounding_shift_right(y, kWeightScale);
    
      492
          int32_t u = (src[b_index_] - y) * kBUWeight + half_;
    
      492
          u = rounding_shift_right(u, kWeightScale);
    
      492
          int32_t v = (src[r_index_] - y) * kRVWeight + half_;
    
      492
          v = rounding_shift_right(v, kWeightScale);
    
      492
          dst[0] = saturating_cast<int32_t, uint8_t>(y);
    
      492
          dst[1] = saturating_cast<int32_t, uint8_t>(u);
    
      492
          dst[2] = saturating_cast<int32_t, uint8_t>(v);
    
      492
        }
    
       private:
    
        static constexpr size_t r_index_ = BGR ? 2 : 0;
    
        static constexpr size_t g_index_ = 1;
    
        static constexpr size_t b_index_ = BGR ? 0 : 2;
    
        static constexpr size_t step_ = kAlpha ? 4 : 3;
    
        static constexpr uint32_t half_ =
    
            (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
    
        KLEIDICV_FORCE_INLINE
    
      11424
        static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
    
      11424
          return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
    
        }
    
      };  // end of class RGBToYUVAll<bool BGR, bool kAlpha>
    
      template <typename OperationType, typename ScalarType>
    
      336
      kleidicv_error_t rgb2yuv_operation(OperationType &operation,
    
                                         const ScalarType *src, size_t src_stride,
    
                                         ScalarType *dst, size_t dst_stride,
    
                                         size_t width, size_t height) {
    
        16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 80 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 80 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 80 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 80 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 80 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 80 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 80 times.

      336
        CHECK_POINTER_AND_STRIDE(src, src_stride, height);
    
        16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 76 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 76 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 76 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 76 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 76 times.

      320
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    
        24/24✓ Branch 0 taken 6 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 65 times.
✓ Branch 4 taken 11 times.
✓ Branch 5 taken 65 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 70 times.
✓ Branch 8 taken 5 times.
✓ Branch 9 taken 65 times.
✓ Branch 10 taken 11 times.
✓ Branch 11 taken 65 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 70 times.
✓ Branch 14 taken 5 times.
✓ Branch 15 taken 65 times.
✓ Branch 16 taken 11 times.
✓ Branch 17 taken 65 times.
✓ Branch 18 taken 6 times.
✓ Branch 19 taken 70 times.
✓ Branch 20 taken 5 times.
✓ Branch 21 taken 65 times.
✓ Branch 22 taken 11 times.
✓ Branch 23 taken 65 times.

      304
        CHECK_IMAGE_SIZE(width, height);
    
      260
        Rectangle rect{width, height};
    
      260
        Rows src_rows{src, src_stride, operation.input_channels()};
    
      260
        Rows dst_rows{dst, dst_stride, 3};
    
      260
        apply_operation_by_rows(operation, rect, src_rows, dst_rows);
    
      260
        return KLEIDICV_OK;
    
      336
      }
    
      using RGBToYUV = RGBToYUVAll<false, false>;
    
      using RGBAToYUV = RGBToYUVAll<false, true>;
    
      using BGRToYUV = RGBToYUVAll<true, false>;
    
      using BGRAToYUV = RGBToYUVAll<true, true>;
    
      KLEIDICV_TARGET_FN_ATTRS
    
      360
      kleidicv_error_t rgb_to_yuv444_u8(const uint8_t *src, size_t src_stride,
    
                                        uint8_t *dst, size_t dst_stride, size_t width,
    
                                        size_t height,
    
                                        kleidicv_color_conversion_t color_format) {
    
        5/5✓ Branch 0 taken 84 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 84 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.

      360
        switch (color_format) {
    
          case KLEIDICV_RGB_TO_YUV444: {
    
      84
            RGBToYUV operation;
    
      168
            return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
    
      84
                                     width, height);
    
      84
          }
    
          case KLEIDICV_BGR_TO_YUV444: {
    
      84
            BGRToYUV operation;
    
      168
            return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
    
      84
                                     width, height);
    
      84
          }
    
          case KLEIDICV_RGBA_TO_YUV444: {
    
      84
            RGBAToYUV operation;
    
      168
            return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
    
      84
                                     width, height);
    
      84
          }
    
          case KLEIDICV_BGRA_TO_YUV444: {
    
      84
            BGRAToYUV operation;
    
      168
            return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
    
      84
                                     width, height);
    
      84
          }
    
          default:
    
      24
            return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
        }
    
        return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
      360
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include "kleidicv/conversions/rgb_to_yuv.h"
6			#include "kleidicv/kleidicv.h"
7			#include "kleidicv/neon.h"
8			#include "rgb_to_yuv444_coefficients.h"
9			namespace kleidicv::neon {
10
11			template <bool BGR, bool kAlpha>
12			class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
13			public:
14			using VecTraits = neon::VecTraits<uint8_t>;
15			using ScalarType = VecTraits::ScalarType;
16			using VectorType = VecTraits::VectorType;
17			using RawSourceVectorType =
18			typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
19
20			explicit RGBToYUVAll() = default;
21
22			// Returns the number of channels in the input image.
23		260	static constexpr size_t input_channels() {
24		260	return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
25			}
26
27			KLEIDICV_FORCE_INLINE
28		1904	void vector_path(const ScalarType *src, ScalarType *dst) {
29		1904	RawSourceVectorType vsrc;
30		1904	int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
31			if constexpr (kAlpha) {
32		952	VecTraits::load(src, vsrc);
33
34		952	uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
35		952	uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
36			if constexpr (BGR) {
37		476	b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
38		476	b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
39		476	r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
40		476	r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
41			} else {
42		476	r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
43		476	r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
44		476	b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
45		476	b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
46			}
47		952	uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
48		952	g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
49		952	uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
50		952	g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
51		952	} else {
52			// Load deinterleaved
53		952	vsrc = vld3q_u8(src);
54		952	r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
55		952	r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
56		952	g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
57		952	g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
58		952	b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
59		952	b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
60			}
61			// Compute Y value in 32-bit precision
62		1904	int16x8_t y_l, y_h;
63			{
64		1904	int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
65		1904	int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
66		1904	int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
67		1904	int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
68
69		1904	y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
70		1904	y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
71		1904	y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
72		1904	y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
73
74		1904	y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
75		1904	y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
76		1904	y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
77		1904	y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
78
79		1904	y_l = combine_scaled_s16(y_ll, y_lh);
80		1904	y_h = combine_scaled_s16(y_hl, y_hh);
81		1904	}
82
83			// Using the 16-bit Y value, calculate U
84		1904	int16x8_t u_l, u_h;
85			{
86		1904	int16x8_t uy_l = vqsubq(b_l, y_l);
87		1904	int16x8_t uy_h = vqsubq(b_h, y_h);
88
89		1904	int32x4_t u_ll = vdupq_n_s32(half_);
90		1904	int32x4_t u_lh = u_ll;
91		1904	int32x4_t u_hl = u_ll;
92		1904	int32x4_t u_hh = u_ll;
93
94		1904	u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
95		1904	u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
96		1904	u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
97		1904	u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
98
99		1904	u_l = combine_scaled_s16(u_ll, u_lh);
100		1904	u_h = combine_scaled_s16(u_hl, u_hh);
101		1904	}
102
103			// Using the 16-bit Y value, calculate V
104		1904	int16x8_t v_l, v_h;
105			{
106		1904	int16x8_t vy_l = vqsubq(r_l, y_l);
107		1904	int16x8_t vy_h = vqsubq(r_h, y_h);
108
109		1904	int32x4_t v_ll = vdupq_n_s32(half_);
110		1904	int32x4_t v_lh = v_ll;
111		1904	int32x4_t v_hl = v_ll;
112		1904	int32x4_t v_hh = v_ll;
113
114		1904	v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
115		1904	v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
116		1904	v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
117		1904	v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
118
119		1904	v_l = combine_scaled_s16(v_ll, v_lh);
120		1904	v_h = combine_scaled_s16(v_hl, v_hh);
121		1904	}
122
123			// Narrow the results to 8 bits
124		1904	uint8x16x3_t yuv;
125		1904	yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
126		1904	yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
127		1904	yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
128
129			// Store interleaved YUV pixels to memory.
130		1904	vst3q_u8(dst, yuv);
131		1904	}
132
133		492	void scalar_path(const ScalarType *src, ScalarType *dst) {
134		984	int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
135		492	src[b_index_] * kBYWeight;
136		492	y = rounding_shift_right(y, kWeightScale);
137		492	int32_t u = (src[b_index_] - y) * kBUWeight + half_;
138		492	u = rounding_shift_right(u, kWeightScale);
139		492	int32_t v = (src[r_index_] - y) * kRVWeight + half_;
140		492	v = rounding_shift_right(v, kWeightScale);
141		492	dst[0] = saturating_cast<int32_t, uint8_t>(y);
142		492	dst[1] = saturating_cast<int32_t, uint8_t>(u);
143		492	dst[2] = saturating_cast<int32_t, uint8_t>(v);
144		492	}
145
146			private:
147			static constexpr size_t r_index_ = BGR ? 2 : 0;
148			static constexpr size_t g_index_ = 1;
149			static constexpr size_t b_index_ = BGR ? 0 : 2;
150			static constexpr size_t step_ = kAlpha ? 4 : 3;
151			static constexpr uint32_t half_ =
152			(std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
153
154			KLEIDICV_FORCE_INLINE
155		11424	static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
156		11424	return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
157			}
158			}; // end of class RGBToYUVAll<bool BGR, bool kAlpha>
159
160			template <typename OperationType, typename ScalarType>
161		336	kleidicv_error_t rgb2yuv_operation(OperationType &operation,
162			const ScalarType *src, size_t src_stride,
163			ScalarType *dst, size_t dst_stride,
164			size_t width, size_t height) {
165	16/16 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 80 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 80 times. ✓ Branch 4 taken 4 times. ✓ Branch 5 taken 80 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 80 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 80 times. ✓ Branch 10 taken 4 times. ✓ Branch 11 taken 80 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 80 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 80 times.	336	CHECK_POINTER_AND_STRIDE(src, src_stride, height);
166	16/16 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 76 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 76 times. ✓ Branch 4 taken 4 times. ✓ Branch 5 taken 76 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 76 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 76 times. ✓ Branch 10 taken 4 times. ✓ Branch 11 taken 76 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 76 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 76 times.	320	CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
167	24/24 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 70 times. ✓ Branch 2 taken 5 times. ✓ Branch 3 taken 65 times. ✓ Branch 4 taken 11 times. ✓ Branch 5 taken 65 times. ✓ Branch 6 taken 6 times. ✓ Branch 7 taken 70 times. ✓ Branch 8 taken 5 times. ✓ Branch 9 taken 65 times. ✓ Branch 10 taken 11 times. ✓ Branch 11 taken 65 times. ✓ Branch 12 taken 6 times. ✓ Branch 13 taken 70 times. ✓ Branch 14 taken 5 times. ✓ Branch 15 taken 65 times. ✓ Branch 16 taken 11 times. ✓ Branch 17 taken 65 times. ✓ Branch 18 taken 6 times. ✓ Branch 19 taken 70 times. ✓ Branch 20 taken 5 times. ✓ Branch 21 taken 65 times. ✓ Branch 22 taken 11 times. ✓ Branch 23 taken 65 times.	304	CHECK_IMAGE_SIZE(width, height);
168
169		260	Rectangle rect{width, height};
170		260	Rows src_rows{src, src_stride, operation.input_channels()};
171		260	Rows dst_rows{dst, dst_stride, 3};
172
173		260	apply_operation_by_rows(operation, rect, src_rows, dst_rows);
174		260	return KLEIDICV_OK;
175		336	}
176
177			using RGBToYUV = RGBToYUVAll<false, false>;
178			using RGBAToYUV = RGBToYUVAll<false, true>;
179			using BGRToYUV = RGBToYUVAll<true, false>;
180			using BGRAToYUV = RGBToYUVAll<true, true>;
181
182			KLEIDICV_TARGET_FN_ATTRS
183		360	kleidicv_error_t rgb_to_yuv444_u8(const uint8_t *src, size_t src_stride,
184			uint8_t *dst, size_t dst_stride, size_t width,
185			size_t height,
186			kleidicv_color_conversion_t color_format) {
187	5/5 ✓ Branch 0 taken 84 times. ✓ Branch 1 taken 24 times. ✓ Branch 2 taken 84 times. ✓ Branch 3 taken 84 times. ✓ Branch 4 taken 84 times.	360	switch (color_format) {
188			case KLEIDICV_RGB_TO_YUV444: {
189		84	RGBToYUV operation;
190		168	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
191		84	width, height);
192		84	}
193
194			case KLEIDICV_BGR_TO_YUV444: {
195		84	BGRToYUV operation;
196		168	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
197		84	width, height);
198		84	}
199
200			case KLEIDICV_RGBA_TO_YUV444: {
201		84	RGBAToYUV operation;
202		168	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
203		84	width, height);
204		84	}
205
206			case KLEIDICV_BGRA_TO_YUV444: {
207		84	BGRAToYUV operation;
208		168	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride,
209		84	width, height);
210		84	}
211
212			default:
213		24	return KLEIDICV_ERROR_NOT_IMPLEMENTED;
214			}
215
216			return KLEIDICV_ERROR_NOT_IMPLEMENTED;
217		360	}
218
219			} // namespace kleidicv::neon
220