KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/conversions/rgb_to_yuv_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	122	122	100.0%
Functions:	24	24	100.0%
Branches:	56	56	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include "kleidicv/conversions/rgb_to_yuv.h"
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/neon.h"
    
      namespace kleidicv::neon {
    
      template <bool BGR, bool kAlpha>
    
      class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
    
       public:
    
        using VecTraits = neon::VecTraits<uint8_t>;
    
        using ScalarType = VecTraits::ScalarType;
    
        using VectorType = VecTraits::VectorType;
    
        using RawSourceVectorType =
    
            typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
    
        explicit RGBToYUVAll() = default;
    
        // Returns the number of channels in the input image.
    
      292
        static constexpr size_t input_channels() {
    
      292
          return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
    
        }
    
      1880
        void vector_path(const ScalarType *src, ScalarType *dst) {
    
      1880
          RawSourceVectorType vsrc;
    
      1880
          int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
    
          if constexpr (kAlpha) {
    
      940
            VecTraits::load(src, vsrc);
    
      940
            uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
    
      940
            uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
    
            if constexpr (BGR) {
    
      470
              b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
    
      470
              b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
    
      470
              r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
    
      470
              r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
    
            } else {
    
      470
              r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
    
      470
              r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
    
      470
              b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
    
      470
              b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
    
            }
    
      940
            uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
    
      940
            g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
    
      940
            uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
    
      940
            g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
    
      940
          } else {
    
            // Load deinterleaved
    
      940
            vsrc = vld3q_u8(src);
    
      940
            r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
    
      940
            r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
    
      940
            g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
    
      940
            g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
    
      940
            b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
    
      940
            b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
    
          }
    
          // Compute Y value in 32-bit precision
    
      1880
          int16x8_t y_l, y_h;
    
          {
    
      1880
            int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
    
      1880
            int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
    
      1880
            int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
    
      1880
            int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
    
      1880
            y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
    
      1880
            y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
    
      1880
            y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
    
      1880
            y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
    
      1880
            y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
    
      1880
            y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
    
      1880
            y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
    
      1880
            y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
    
      1880
            y_l = combine_scaled_s16(y_ll, y_lh);
    
      1880
            y_h = combine_scaled_s16(y_hl, y_hh);
    
      1880
          }
    
          // Using the 16-bit Y value, calculate U
    
      1880
          int16x8_t u_l, u_h;
    
          {
    
      1880
            int16x8_t uy_l = vqsubq(b_l, y_l);
    
      1880
            int16x8_t uy_h = vqsubq(b_h, y_h);
    
      1880
            int32x4_t u_ll = vdupq_n_s32(half_);
    
      1880
            int32x4_t u_lh = u_ll;
    
      1880
            int32x4_t u_hl = u_ll;
    
      1880
            int32x4_t u_hh = u_ll;
    
      1880
            u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
    
      1880
            u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
    
      1880
            u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
    
      1880
            u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
    
      1880
            u_l = combine_scaled_s16(u_ll, u_lh);
    
      1880
            u_h = combine_scaled_s16(u_hl, u_hh);
    
      1880
          }
    
          // Using the 16-bit Y value, calculate V
    
      1880
          int16x8_t v_l, v_h;
    
          {
    
      1880
            int16x8_t vy_l = vqsubq(r_l, y_l);
    
      1880
            int16x8_t vy_h = vqsubq(r_h, y_h);
    
      1880
            int32x4_t v_ll = vdupq_n_s32(half_);
    
      1880
            int32x4_t v_lh = v_ll;
    
      1880
            int32x4_t v_hl = v_ll;
    
      1880
            int32x4_t v_hh = v_ll;
    
      1880
            v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
    
      1880
            v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
    
      1880
            v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
    
      1880
            v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
    
      1880
            v_l = combine_scaled_s16(v_ll, v_lh);
    
      1880
            v_h = combine_scaled_s16(v_hl, v_hh);
    
      1880
          }
    
          // Narrow the results to 8 bits
    
      1880
          uint8x16x3_t yuv;
    
      1880
          yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
    
      1880
          yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
    
      1880
          yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
    
          // Store interleaved YUV pixels to memory.
    
      1880
          vst3q_u8(dst, yuv);
    
      1880
        }
    
      412
        void scalar_path(const ScalarType *src, ScalarType *dst) {
    
      824
          int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
    
      412
                      src[b_index_] * kBYWeight;
    
      412
          y = rounding_shift_right(y, kWeightScale);
    
      412
          int32_t u = (src[b_index_] - y) * kBUWeight + half_;
    
      412
          u = rounding_shift_right(u, kWeightScale);
    
      412
          int32_t v = (src[r_index_] - y) * kRVWeight + half_;
    
      412
          v = rounding_shift_right(v, kWeightScale);
    
      412
          dst[0] = saturating_cast<int32_t, uint8_t>(y);
    
      412
          dst[1] = saturating_cast<int32_t, uint8_t>(u);
    
      412
          dst[2] = saturating_cast<int32_t, uint8_t>(v);
    
      412
        }
    
       private:
    
        static constexpr size_t r_index_ = BGR ? 2 : 0;
    
        static constexpr size_t g_index_ = 1;
    
        static constexpr size_t b_index_ = BGR ? 0 : 2;
    
        static constexpr size_t step_ = kAlpha ? 4 : 3;
    
        static constexpr uint32_t half_ =
    
            (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
    
      11280
        static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
    
      11280
          return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
    
        }
    
      };  // end of class RGBToYUVAll<bool BGR, bool kAlpha>
    
      template <typename OperationType, typename ScalarType>
    
      356
      kleidicv_error_t rgb2yuv_operation(OperationType &operation,
    
                                         const ScalarType *src, size_t src_stride,
    
                                         ScalarType *dst, size_t dst_stride,
    
                                         size_t width, size_t height) {
    
        16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 85 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 85 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 85 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 85 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 85 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 85 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 85 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 85 times.

      356
        CHECK_POINTER_AND_STRIDE(src, src_stride, height);
    
        16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 81 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 81 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 81 times.

      340
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    
        24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 73 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 73 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 73 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 77 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 73 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 73 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 77 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 73 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 73 times.

      324
        CHECK_IMAGE_SIZE(width, height);
    
      292
        Rectangle rect{width, height};
    
      292
        Rows src_rows{src, src_stride, operation.input_channels()};
    
      292
        Rows dst_rows{dst, dst_stride, 3};
    
      292
        apply_operation_by_rows(operation, rect, src_rows, dst_rows);
    
      292
        return KLEIDICV_OK;
    
      356
      }
    
      using RGBToYUV = RGBToYUVAll<false, false>;
    
      using RGBAToYUV = RGBToYUVAll<false, true>;
    
      using BGRToYUV = RGBToYUVAll<true, false>;
    
      using BGRAToYUV = RGBToYUVAll<true, true>;
    
      KLEIDICV_TARGET_FN_ATTRS
    
      89
      kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride,
    
                                     uint8_t *dst, size_t dst_stride, size_t width,
    
                                     size_t height) {
    
      89
        RGBToYUV operation;
    
      267
        return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
    
      89
                                 height);
    
      89
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      89
      kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride,
    
                                      uint8_t *dst, size_t dst_stride, size_t width,
    
                                      size_t height) {
    
      89
        RGBAToYUV operation;
    
      267
        return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
    
      89
                                 height);
    
      89
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      89
      kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride,
    
                                     uint8_t *dst, size_t dst_stride, size_t width,
    
                                     size_t height) {
    
      89
        BGRToYUV operation;
    
      267
        return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
    
      89
                                 height);
    
      89
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      89
      kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride,
    
                                      uint8_t *dst, size_t dst_stride, size_t width,
    
                                      size_t height) {
    
      89
        BGRAToYUV operation;
    
      267
        return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
    
      89
                                 height);
    
      89
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include "kleidicv/conversions/rgb_to_yuv.h"
6			#include "kleidicv/kleidicv.h"
7			#include "kleidicv/neon.h"
8
9			namespace kleidicv::neon {
10
11			template <bool BGR, bool kAlpha>
12			class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
13			public:
14			using VecTraits = neon::VecTraits<uint8_t>;
15			using ScalarType = VecTraits::ScalarType;
16			using VectorType = VecTraits::VectorType;
17			using RawSourceVectorType =
18			typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
19
20			explicit RGBToYUVAll() = default;
21
22			// Returns the number of channels in the input image.
23		292	static constexpr size_t input_channels() {
24		292	return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
25			}
26
27		1880	void vector_path(const ScalarType *src, ScalarType *dst) {
28		1880	RawSourceVectorType vsrc;
29		1880	int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
30			if constexpr (kAlpha) {
31		940	VecTraits::load(src, vsrc);
32
33		940	uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
34		940	uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
35			if constexpr (BGR) {
36		470	b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
37		470	b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
38		470	r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
39		470	r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
40			} else {
41		470	r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
42		470	r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
43		470	b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
44		470	b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
45			}
46		940	uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
47		940	g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
48		940	uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
49		940	g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
50		940	} else {
51			// Load deinterleaved
52		940	vsrc = vld3q_u8(src);
53		940	r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
54		940	r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
55		940	g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
56		940	g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
57		940	b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
58		940	b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
59			}
60			// Compute Y value in 32-bit precision
61		1880	int16x8_t y_l, y_h;
62			{
63		1880	int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
64		1880	int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
65		1880	int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
66		1880	int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
67
68		1880	y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
69		1880	y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
70		1880	y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
71		1880	y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
72
73		1880	y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
74		1880	y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
75		1880	y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
76		1880	y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
77
78		1880	y_l = combine_scaled_s16(y_ll, y_lh);
79		1880	y_h = combine_scaled_s16(y_hl, y_hh);
80		1880	}
81
82			// Using the 16-bit Y value, calculate U
83		1880	int16x8_t u_l, u_h;
84			{
85		1880	int16x8_t uy_l = vqsubq(b_l, y_l);
86		1880	int16x8_t uy_h = vqsubq(b_h, y_h);
87
88		1880	int32x4_t u_ll = vdupq_n_s32(half_);
89		1880	int32x4_t u_lh = u_ll;
90		1880	int32x4_t u_hl = u_ll;
91		1880	int32x4_t u_hh = u_ll;
92
93		1880	u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
94		1880	u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
95		1880	u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
96		1880	u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
97
98		1880	u_l = combine_scaled_s16(u_ll, u_lh);
99		1880	u_h = combine_scaled_s16(u_hl, u_hh);
100		1880	}
101
102			// Using the 16-bit Y value, calculate V
103		1880	int16x8_t v_l, v_h;
104			{
105		1880	int16x8_t vy_l = vqsubq(r_l, y_l);
106		1880	int16x8_t vy_h = vqsubq(r_h, y_h);
107
108		1880	int32x4_t v_ll = vdupq_n_s32(half_);
109		1880	int32x4_t v_lh = v_ll;
110		1880	int32x4_t v_hl = v_ll;
111		1880	int32x4_t v_hh = v_ll;
112
113		1880	v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
114		1880	v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
115		1880	v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
116		1880	v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
117
118		1880	v_l = combine_scaled_s16(v_ll, v_lh);
119		1880	v_h = combine_scaled_s16(v_hl, v_hh);
120		1880	}
121
122			// Narrow the results to 8 bits
123		1880	uint8x16x3_t yuv;
124		1880	yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
125		1880	yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
126		1880	yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
127
128			// Store interleaved YUV pixels to memory.
129		1880	vst3q_u8(dst, yuv);
130		1880	}
131
132		412	void scalar_path(const ScalarType *src, ScalarType *dst) {
133		824	int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
134		412	src[b_index_] * kBYWeight;
135		412	y = rounding_shift_right(y, kWeightScale);
136		412	int32_t u = (src[b_index_] - y) * kBUWeight + half_;
137		412	u = rounding_shift_right(u, kWeightScale);
138		412	int32_t v = (src[r_index_] - y) * kRVWeight + half_;
139		412	v = rounding_shift_right(v, kWeightScale);
140		412	dst[0] = saturating_cast<int32_t, uint8_t>(y);
141		412	dst[1] = saturating_cast<int32_t, uint8_t>(u);
142		412	dst[2] = saturating_cast<int32_t, uint8_t>(v);
143		412	}
144
145			private:
146			static constexpr size_t r_index_ = BGR ? 2 : 0;
147			static constexpr size_t g_index_ = 1;
148			static constexpr size_t b_index_ = BGR ? 0 : 2;
149			static constexpr size_t step_ = kAlpha ? 4 : 3;
150			static constexpr uint32_t half_ =
151			(std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
152
153		11280	static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
154		11280	return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
155			}
156			}; // end of class RGBToYUVAll<bool BGR, bool kAlpha>
157
158			template <typename OperationType, typename ScalarType>
159		356	kleidicv_error_t rgb2yuv_operation(OperationType &operation,
160			const ScalarType *src, size_t src_stride,
161			ScalarType *dst, size_t dst_stride,
162			size_t width, size_t height) {
163	16/16 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 85 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 85 times. ✓ Branch 4 taken 4 times. ✓ Branch 5 taken 85 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 85 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 85 times. ✓ Branch 10 taken 4 times. ✓ Branch 11 taken 85 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 85 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 85 times.	356	CHECK_POINTER_AND_STRIDE(src, src_stride, height);
164	16/16 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 81 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 81 times. ✓ Branch 4 taken 4 times. ✓ Branch 5 taken 81 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 81 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 81 times. ✓ Branch 10 taken 4 times. ✓ Branch 11 taken 81 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 81 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 81 times.	340	CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
165	24/24 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 77 times. ✓ Branch 2 taken 4 times. ✓ Branch 3 taken 73 times. ✓ Branch 4 taken 8 times. ✓ Branch 5 taken 73 times. ✓ Branch 6 taken 4 times. ✓ Branch 7 taken 77 times. ✓ Branch 8 taken 4 times. ✓ Branch 9 taken 73 times. ✓ Branch 10 taken 8 times. ✓ Branch 11 taken 73 times. ✓ Branch 12 taken 4 times. ✓ Branch 13 taken 77 times. ✓ Branch 14 taken 4 times. ✓ Branch 15 taken 73 times. ✓ Branch 16 taken 8 times. ✓ Branch 17 taken 73 times. ✓ Branch 18 taken 4 times. ✓ Branch 19 taken 77 times. ✓ Branch 20 taken 4 times. ✓ Branch 21 taken 73 times. ✓ Branch 22 taken 8 times. ✓ Branch 23 taken 73 times.	324	CHECK_IMAGE_SIZE(width, height);
166
167		292	Rectangle rect{width, height};
168		292	Rows src_rows{src, src_stride, operation.input_channels()};
169		292	Rows dst_rows{dst, dst_stride, 3};
170
171		292	apply_operation_by_rows(operation, rect, src_rows, dst_rows);
172		292	return KLEIDICV_OK;
173		356	}
174
175			using RGBToYUV = RGBToYUVAll<false, false>;
176			using RGBAToYUV = RGBToYUVAll<false, true>;
177			using BGRToYUV = RGBToYUVAll<true, false>;
178			using BGRAToYUV = RGBToYUVAll<true, true>;
179
180			KLEIDICV_TARGET_FN_ATTRS
181		89	kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride,
182			uint8_t *dst, size_t dst_stride, size_t width,
183			size_t height) {
184		89	RGBToYUV operation;
185		267	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
186		89	height);
187		89	}
188
189			KLEIDICV_TARGET_FN_ATTRS
190		89	kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride,
191			uint8_t *dst, size_t dst_stride, size_t width,
192			size_t height) {
193		89	RGBAToYUV operation;
194		267	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
195		89	height);
196		89	}
197
198			KLEIDICV_TARGET_FN_ATTRS
199		89	kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride,
200			uint8_t *dst, size_t dst_stride, size_t width,
201			size_t height) {
202		89	BGRToYUV operation;
203		267	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
204		89	height);
205		89	}
206
207			KLEIDICV_TARGET_FN_ATTRS
208		89	kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride,
209			uint8_t *dst, size_t dst_stride, size_t width,
210			size_t height) {
211		89	BGRAToYUV operation;
212		267	return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
213		89	height);
214		89	}
215
216			} // namespace kleidicv::neon
217