Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_YUV420_TO_RGB_NEON_H | ||
6 | #define KLEIDICV_YUV420_TO_RGB_NEON_H | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <utility> | ||
10 | |||
11 | #include "kleidicv/kleidicv.h" | ||
12 | #include "kleidicv/traits.h" | ||
13 | #include "yuv420_coefficients.h" | ||
14 | |||
15 | namespace kleidicv::neon { | ||
16 | |||
17 | template <bool BGR, bool kAlpha> | ||
18 | class YUV420XToRGBxOrBGRx { | ||
19 | public: | ||
20 | using ScalarType = uint8_t; | ||
21 | using VectorType = uint8x16_t; | ||
22 | |||
23 | int32x4_t y_weight_; | ||
24 | int32x2x2_t uv_weights_; | ||
25 | int32x4_t r_base_, g_base_, b_base_; | ||
26 | int8x16x4_t de_interleave_indices_; | ||
27 | const bool v_first_; | ||
28 | |||
29 | // Returns the number of channels in the output image. | ||
30 | 620 | static constexpr size_t output_channels() { | |
31 | 620 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
32 | } | ||
33 | |||
34 | 8064 | static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { | |
35 | 16128 | return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)), | |
36 | 8064 | vmovn_s32(vshrq_n_s32(b, kWeightScale))); | |
37 | } | ||
38 | |||
39 | // clang-format off | ||
40 | |||
41 | static constexpr int8_t kDeInterleaveTableIndices[64] = { | ||
42 | /* low and even */ | ||
43 | 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, | ||
44 | /* high and even */ | ||
45 | 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1, | ||
46 | /* low and odd */ | ||
47 | 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1, | ||
48 | /* high and odd */ | ||
49 | 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, | ||
50 | }; | ||
51 | |||
52 | // clang-format on | ||
53 | |||
54 | 884 | explicit YUV420XToRGBxOrBGRx(bool v_first) | |
55 | 884 | : y_weight_{vdupq_n_s32(kYWeight)}, | |
56 | 884 | uv_weights_{vld2_s32(kUVWeights)}, | |
57 | 1768 | r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - | |
58 | 884 | 128 * kUVWeights[kRVWeightIndex])}, | |
59 | 1768 | g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - | |
60 | 884 | 128 * (kUVWeights[1] + kUVWeights[2]))}, | |
61 | 884 | b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])}, | |
62 | 884 | de_interleave_indices_{}, | |
63 | 884 | v_first_{v_first} { | |
64 | 884 | neon::VecTraits<int8_t>::load(kDeInterleaveTableIndices, | |
65 | 884 | de_interleave_indices_); | |
66 | 884 | } | |
67 | |||
68 | 672 | void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l, | |
69 | int32x4_t u_h, int32x4_t v_l, int32x4_t v_h, | ||
70 | ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { | ||
71 | // Y' = saturating(Ya - 16) and widen to 32-bits. | ||
72 | 672 | uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16)); | |
73 | 672 | uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16)); | |
74 | |||
75 | 1344 | uint32x4_t y0_m16_even_l = | |
76 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0])); | |
77 | 1344 | uint32x4_t y0_m16_even_h = | |
78 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1])); | |
79 | 1344 | uint32x4_t y0_m16_odd_l = | |
80 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2])); | |
81 | 1344 | uint32x4_t y0_m16_odd_h = | |
82 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3])); | |
83 | |||
84 | 1344 | uint32x4_t y1_m16_even_l = | |
85 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0])); | |
86 | 1344 | uint32x4_t y1_m16_even_h = | |
87 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1])); | |
88 | 1344 | uint32x4_t y1_m16_odd_l = | |
89 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2])); | |
90 | 1344 | uint32x4_t y1_m16_odd_h = | |
91 | 672 | vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3])); | |
92 | |||
93 | // Y = Weight(Y) * Y' | ||
94 | 672 | y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_); | |
95 | 672 | y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_); | |
96 | 672 | y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_); | |
97 | 672 | y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_); | |
98 | |||
99 | 672 | y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_); | |
100 | 672 | y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_); | |
101 | 672 | y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_); | |
102 | 672 | y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_); | |
103 | |||
104 | // Swap U and V planes for YV12 layout. | ||
105 | 8/8 ✓ | 672 | if (v_first_) { |
106 | 336 | std::swap(u_l, v_l); | |
107 | 336 | std::swap(u_h, v_h); | |
108 | 336 | } | |
109 | |||
110 | // R - Y = Rbase + Weight(RV) * V = | ||
111 | // (1 << (SCALE - 1)) - 128 * Weight(RV) + Weight(RV) * V | ||
112 | 672 | int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0); | |
113 | 672 | int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0); | |
114 | |||
115 | // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = | ||
116 | // (1 << (SCALE - 1)) - 128 * Weight(GU) - | ||
117 | // 128 * Weight(GV) + | ||
118 | // Weight(GU) * U + Weight(GV) * V | ||
119 | 672 | int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0); | |
120 | 672 | int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0); | |
121 | 672 | g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1); | |
122 | 672 | g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1); | |
123 | |||
124 | // B - Y = Bbase + Weight(BU) * U = | ||
125 | // (1 << (SCALE - 1)) - 128 * Weight(BU) + Weight(BU) * U | ||
126 | 672 | int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1); | |
127 | 672 | int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1); | |
128 | |||
129 | // R = (R - Y) + Y | ||
130 | 672 | int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l); | |
131 | 672 | int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h); | |
132 | 672 | int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l); | |
133 | 672 | int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h); | |
134 | 672 | int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h); | |
135 | 672 | int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h); | |
136 | |||
137 | 672 | int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l); | |
138 | 672 | int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h); | |
139 | 672 | int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l); | |
140 | 672 | int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h); | |
141 | 672 | int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h); | |
142 | 672 | int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h); | |
143 | |||
144 | // G = (G - Y) + Y | ||
145 | 672 | int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l); | |
146 | 672 | int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h); | |
147 | 672 | int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l); | |
148 | 672 | int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h); | |
149 | 672 | int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h); | |
150 | 672 | int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h); | |
151 | |||
152 | 672 | int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l); | |
153 | 672 | int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h); | |
154 | 672 | int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l); | |
155 | 672 | int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h); | |
156 | 672 | int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h); | |
157 | 672 | int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h); | |
158 | |||
159 | // B = (B - Y) + Y | ||
160 | 672 | int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l); | |
161 | 672 | int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h); | |
162 | 672 | int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l); | |
163 | 672 | int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h); | |
164 | 672 | int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h); | |
165 | 672 | int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h); | |
166 | |||
167 | 672 | int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l); | |
168 | 672 | int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h); | |
169 | 672 | int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l); | |
170 | 672 | int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h); | |
171 | 672 | int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h); | |
172 | 672 | int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h); | |
173 | |||
174 | // Zip even and odd RGB pixels. | ||
175 | 672 | uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); | |
176 | 672 | uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); | |
177 | 672 | uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); | |
178 | 672 | uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); | |
179 | 672 | uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); | |
180 | 672 | uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); | |
181 | |||
182 | if constexpr (kAlpha) { | ||
183 | 336 | uint8x16x4_t rgba0, rgba1; | |
184 | // Red channel | ||
185 | 336 | rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); | |
186 | 336 | rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); | |
187 | // Green channel | ||
188 | 336 | rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); | |
189 | 336 | rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); | |
190 | // Blue channel | ||
191 | 336 | rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); | |
192 | 336 | rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); | |
193 | // Alpha channel | ||
194 | 336 | rgba0.val[3] = vdupq_n_u8(0xFF); | |
195 | 336 | rgba1.val[3] = vdupq_n_u8(0xFF); | |
196 | |||
197 | if constexpr (BGR) { | ||
198 | 168 | std::swap(rgba0.val[0], rgba0.val[2]); | |
199 | 168 | std::swap(rgba1.val[0], rgba1.val[2]); | |
200 | } | ||
201 | |||
202 | // Store RGBA pixels to memory. | ||
203 | 336 | vst4q_u8(rgbx_row_0, rgba0); | |
204 | 336 | vst4q_u8(rgbx_row_1, rgba1); | |
205 | 336 | } else { | |
206 | 336 | uint8x16x3_t rgb0, rgb1; | |
207 | // Red channel | ||
208 | 336 | rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); | |
209 | 336 | rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); | |
210 | // Green channel | ||
211 | 336 | rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); | |
212 | 336 | rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); | |
213 | // Blue channel | ||
214 | 336 | rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); | |
215 | 336 | rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); | |
216 | |||
217 | if constexpr (BGR) { | ||
218 | 168 | std::swap(rgb0.val[0], rgb0.val[2]); | |
219 | 168 | std::swap(rgb1.val[0], rgb1.val[2]); | |
220 | } | ||
221 | |||
222 | // Store RGB pixels to memory. | ||
223 | 336 | vst3q_u8(rgbx_row_0, rgb0); | |
224 | 336 | vst3q_u8(rgbx_row_1, rgb1); | |
225 | 336 | } | |
226 | 672 | } | |
227 | |||
228 | 22840 | void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128, | |
229 | int32_t v_m128, uint8_t *rgbx_rows[2]) { | ||
230 | 8/8 ✓ | 68520 | for (size_t selector = 0; selector < 2; ++selector) { |
231 | 45680 | int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0); | |
232 | 45680 | int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128; | |
233 | 91360 | int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 + | |
234 | 45680 | kUVWeights[kGVWeightIndex] * v_m128; | |
235 | 45680 | int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128; | |
236 | |||
237 | 45680 | r = rounding_shift_right(r, kWeightScale); | |
238 | 45680 | g = rounding_shift_right(g, kWeightScale); | |
239 | 45680 | b = rounding_shift_right(b, kWeightScale); | |
240 | |||
241 | if constexpr (BGR) { | ||
242 | 22840 | std::swap(r, b); | |
243 | } | ||
244 | |||
245 | 45680 | rgbx_rows[selector][0] = saturating_cast<int32_t, uint8_t>(r); | |
246 | 45680 | rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g); | |
247 | 45680 | rgbx_rows[selector][2] = saturating_cast<int32_t, uint8_t>(b); | |
248 | |||
249 | if constexpr (kAlpha) { | ||
250 | 22840 | rgbx_rows[selector][3] = 0xFF; | |
251 | } | ||
252 | |||
253 | 45680 | rgbx_rows[selector] += kAlpha ? 4 : 3; | |
254 | 45680 | } | |
255 | 22840 | } | |
256 | }; | ||
257 | } // namespace kleidicv::neon | ||
258 | |||
259 | #endif // KLEIDICV_YUV420_TO_RGB_NEON_H | ||
260 |
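
The `kDeInterleaveTableIndices` table relies on `vqtbl1q_u8`/`vqtbl1q_s8` returning zero for out-of-range indices, so a pattern like `{0, -1, -1, -1, 2, ...}` splits a register into even and odd bytes and zero-extends them straight into 32-bit lanes. The snippet below is a self-contained sketch of that trick with made-up input data; it is not part of the library.

```cpp
// Standalone sketch: widen the even bytes of a vector into 32-bit lanes
// using a table lookup with -1 (out-of-range) padding.
#include <arm_neon.h>

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical "luma" bytes: 0, 10, 20, ..., 150.
  uint8_t src_bytes[16];
  for (int i = 0; i < 16; ++i) src_bytes[i] = static_cast<uint8_t>(i * 10);
  uint8x16_t src = vld1q_u8(src_bytes);

  // Indices for the "low and even" quarter of the table; -1 selects zero.
  const int8_t low_even_indices[16] = {0, -1, -1, -1, 2, -1, -1, -1,
                                       4, -1, -1, -1, 6, -1, -1, -1};
  uint8x16_t indices = vreinterpretq_u8_s8(vld1q_s8(low_even_indices));

  // Each 32-bit lane now holds one even source byte, zero-extended.
  uint32x4_t widened = vreinterpretq_u32_u8(vqtbl1q_u8(src, indices));

  uint32_t out[4];
  vst1q_u32(out, widened);
  std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  // 0 20 40 60
  return 0;
}
```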
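For reference, here is a minimal scalar sketch of the fixed-point conversion that the class above vectorizes. The weights and 20-bit scale below are assumed BT.601 video-range values chosen for illustration; the real `kYWeight`, `kUVWeights`, and `kWeightScale` are defined in `yuv420_coefficients.h` and are not part of this listing.

```cpp
// Scalar sketch of the fixed-point YUV420 -> RGB math (illustrative weights).
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int kScale = 20;              // assumed fixed-point scale
constexpr int32_t kYW = 1220542;        // ~1.164 * 2^20
constexpr int32_t kRV = 1673527;        // ~1.596 * 2^20
constexpr int32_t kGU = -409993;        // ~-0.391 * 2^20
constexpr int32_t kGV = -852492;        // ~-0.813 * 2^20
constexpr int32_t kBU = 2116026;        // ~2.018 * 2^20

static uint8_t saturate_u8(int32_t v) {
  return static_cast<uint8_t>(std::clamp(v, 0, 255));
}

// One output pixel: Y comes from the luma plane, while U and V are shared by
// a 2x2 block of luma samples (hence the two Y rows in the vector code).
static void yuv_to_rgb_pixel(uint8_t ya, uint8_t ua, uint8_t va,
                             uint8_t rgb[3]) {
  int32_t y = kYW * std::max(static_cast<int32_t>(ya) - 16, 0);
  int32_t u = static_cast<int32_t>(ua) - 128;
  int32_t v = static_cast<int32_t>(va) - 128;
  // Rounding shift right, as in rounding_shift_right(x, kWeightScale).
  const int32_t round = 1 << (kScale - 1);
  rgb[0] = saturate_u8((y + kRV * v + round) >> kScale);
  rgb[1] = saturate_u8((y + kGU * u + kGV * v + round) >> kScale);
  rgb[2] = saturate_u8((y + kBU * u + round) >> kScale);
}

int main() {
  uint8_t rgb[3];
  yuv_to_rgb_pixel(/*ya=*/81, /*ua=*/90, /*va=*/240, rgb);  // roughly pure red
  std::printf("%u %u %u\n", rgb[0], rgb[1], rgb[2]);        // about 254 0 0
  return 0;
}
```

In the vector path, the per-pixel rounding add and the `-128` bias of U and V are folded into `r_base_`, `g_base_`, and `b_base_` in the constructor, so `combine_scaled_s16()` only needs a plain arithmetic shift.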