KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/rgb_to_yuv420_neon.h
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 180 180 100.0%
Functions: 104 104 100.0%
Branches: 80 80 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_RGB_TO_YUV420_H
6 #define KLEIDICV_RGB_TO_YUV420_H
7
8 #include <algorithm>
9 #include <utility>
10
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/neon.h"
13 #include "yuv42x_coefficients.h"
14
15 namespace kleidicv::neon {
16
17 template <bool kAlpha, bool RGB, bool kInterleave>
18 class RGBxorBGRxToYUV420 {
19 public:
20 880 static kleidicv_error_t rgb2yuv420_operation(
21 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
22 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
23 bool v_first, size_t begin, size_t end) {
24 880 size_t row_begin = begin * 2;
25 880 size_t row_end = std::min<size_t>(height, end * 2);
26
27 880 const uint8_t *src_row = nullptr;
28 880 uint8_t *y_row = nullptr;
29 880 uint8_t *u_row = nullptr;
30 880 uint8_t *v_row = nullptr;
31
8/8
✓ Branch 0 taken 220 times.
✓ Branch 1 taken 9242 times.
✓ Branch 2 taken 220 times.
✓ Branch 3 taken 9242 times.
✓ Branch 4 taken 220 times.
✓ Branch 5 taken 9242 times.
✓ Branch 6 taken 220 times.
✓ Branch 7 taken 9242 times.
37848 for (size_t h = row_begin; h < row_end; h++) {
32 36968 src_row = src + src_stride * h;
33 36968 y_row = y_dst + y_stride * h;
34
35 36968 bool evenRow = (h & 1) == 0;
36
37
8/8
✓ Branch 0 taken 4582 times.
✓ Branch 1 taken 4660 times.
✓ Branch 2 taken 4582 times.
✓ Branch 3 taken 4660 times.
✓ Branch 4 taken 4582 times.
✓ Branch 5 taken 4660 times.
✓ Branch 6 taken 4582 times.
✓ Branch 7 taken 4660 times.
36968 if (evenRow) {
38 if constexpr (kInterleave) {
39 9376 u_row = uv_dst + uv_stride * (h / 2);
40 } else {
41 9264 u_row =
42 9264 uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2);
43 // Pointer to the start of the V plane.
44 // The V plane follows the U plane. Both U and V planes are
45 // subsampled at a 2:1 vertical ratio (i.e., each has height / 2
46 // rows) and stored in a single contiguous chroma region.
47 // Because each chroma stride row packs two half-width rows, the
48 // V row for image row h sits (height + 1) / 2 half-rows after its
49 // U row: dividing by 4 selects the stride row, and the parity
50 // term selects the first or second half-width slot within it.
51 18528 v_row = uv_dst + uv_stride * ((h + height + 1) / 4) +
52 9264 (((h + height + 1) / 2) % 2) * ((width + 1) / 2);
53 }
54 18640 }
55
56 36968 LoopUnroll2<TryToAvoidTailLoop> loop{width, kVectorLength};
57 47768 loop.unroll_twice([&](size_t index) {
58 10800 vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow);
59 10800 });
60
61 142360 loop.tail([&](size_t index) {
62 210784 scalar_path(src_row, y_row, u_row, v_row, v_first, index, width,
63 105392 evenRow);
64 105392 });
65 36968 }
66
67 880 return KLEIDICV_OK;
68 880 }
69
70 private:
71 KLEIDICV_FORCE_INLINE
72 10800 static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row,
73 uint8_t *u_row, uint8_t *v_row, const bool v_first,
74 const size_t index, const bool evenRow) {
75 10800 uint32x4_t r0[4], g0[4], b0[4], r1[4], g1[4], b1[4];
76
77 10800 load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, index);
78
79 10800 uint8x16_t y0 = rgb_to_y(r0, g0, b0);
80
81 10800 uint8x16_t y1 = rgb_to_y(r1, g1, b1);
82
83 10800 vst1q_u8(y_row + index, y0);
84 10800 vst1q_u8(y_row + index + kVectorLength, y1);
85
86 // U and V are subsampled by a factor of 2 in both horizontal and vertical
87 // directions for YUV420 format. Therefore, we only compute U and V from
88 // even rows and even columns. When the input RGB image has an odd width or
89 // height, the chroma (U and V) dimensions are rounded up. For example, if
90 // the height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2
91 // = 4.5 -> rounded up). The same rounding is applied for width.
92
8/8
✓ Branch 0 taken 1300 times.
✓ Branch 1 taken 1400 times.
✓ Branch 2 taken 1300 times.
✓ Branch 3 taken 1400 times.
✓ Branch 4 taken 1300 times.
✓ Branch 5 taken 1400 times.
✓ Branch 6 taken 1300 times.
✓ Branch 7 taken 1400 times.
10800 if (evenRow) {
93 5600 uint8x16x2_t uv;
94 5600 int32x4_t r_even[4] = {r0[0], r0[2], r1[0], r1[2]};
95 5600 int32x4_t g_even[4] = {g0[0], g0[2], g1[0], g1[2]};
96 5600 int32x4_t b_even[4] = {b0[0], b0[2], b1[0], b1[2]};
97 5600 rgb_to_uv_2x(r_even, g_even, b_even, uv.val[0], uv.val[1]);
98
99
8/8
✓ Branch 0 taken 700 times.
✓ Branch 1 taken 700 times.
✓ Branch 2 taken 700 times.
✓ Branch 3 taken 700 times.
✓ Branch 4 taken 700 times.
✓ Branch 5 taken 700 times.
✓ Branch 6 taken 700 times.
✓ Branch 7 taken 700 times.
5600 if (v_first) {
100 2800 std::swap(uv.val[0], uv.val[1]);
101 2800 }
102
103 if constexpr (kInterleave) {
104 2800 vst2q_u8(u_row + index, uv);
105 } else {
106 2800 vst1q_u8(u_row + index / 2, uv.val[0]);
107 2800 vst1q_u8(v_row + index / 2, uv.val[1]);
108 }
109 5600 }
110 10800 }
111
112 105392 static void scalar_path(const uint8_t *src_row, uint8_t *y_row,
113 uint8_t *u_row, uint8_t *v_row, const bool v_first,
114 size_t index, const size_t length,
115 const bool evenRow) {
116 105392 const size_t u_index_ = v_first;
117 105392 const size_t v_index_ = !v_first;
118
119
8/8
✓ Branch 0 taken 100198 times.
✓ Branch 1 taken 26348 times.
✓ Branch 2 taken 100198 times.
✓ Branch 3 taken 26348 times.
✓ Branch 4 taken 100198 times.
✓ Branch 5 taken 26348 times.
✓ Branch 6 taken 100198 times.
✓ Branch 7 taken 26348 times.
506184 for (; index < length; index += 1) {
120 400792 uint8_t b0{}, g0{}, r0{};
121 400792 bool evenCol = (index & 1) == 0;
122 400792 b0 = src_row[index * scn + b_index_];
123 400792 g0 = src_row[index * scn + g_index_];
124 400792 r0 = src_row[index * scn + r_index_];
125
126 400792 uint8_t y0 = rgb_to_y(r0, g0, b0);
127 400792 y_row[index] = y0;
128
129 // U and V are subsampled by a factor of 2 in both horizontal and vertical
130 // directions
131 // for YUV420 format. Therefore, we only compute U and V from even rows
132 // and even columns. When the input RGB image has an odd width or height,
133 // the chroma (U and V) dimensions are rounded up. For example, if the
134 // height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2 = 4.5
135 // -> rounded up). The same rounding is applied for width.
136
16/16
✓ Branch 0 taken 51060 times.
✓ Branch 1 taken 49138 times.
✓ Branch 2 taken 28400 times.
✓ Branch 3 taken 22660 times.
✓ Branch 4 taken 51060 times.
✓ Branch 5 taken 49138 times.
✓ Branch 6 taken 28400 times.
✓ Branch 7 taken 22660 times.
✓ Branch 8 taken 51060 times.
✓ Branch 9 taken 49138 times.
✓ Branch 10 taken 28400 times.
✓ Branch 11 taken 22660 times.
✓ Branch 12 taken 51060 times.
✓ Branch 13 taken 49138 times.
✓ Branch 14 taken 28400 times.
✓ Branch 15 taken 22660 times.
400792 if (evenRow && evenCol) {
137 90640 uint8_t uv[2] = {0, 0};
138 90640 rgb_to_uv(r0, g0, b0, uv);
139 if constexpr (kInterleave) {
140 51984 u_row[index] = uv[u_index_];
141 51984 u_row[index + 1] = uv[v_index_];
142 } else {
143 38656 u_row[(index + 1) / 2] = uv[u_index_];
144 38656 v_row[(index + 1) / 2] = uv[v_index_];
145 }
146 90640 }
147 400792 }
148 105392 }
149
150 400792 static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
151 400792 const int kShifted16 = (16 << kWeightScale);
152 400792 const int kHalfShift = (1 << (kWeightScale - 1));
153 801584 int yy =
154 400792 kRYWeight * r + kGYWeight * g + kBYWeight * b + kHalfShift + kShifted16;
155
156 801584 return std::clamp(yy >> kWeightScale, 0, 0xff);
157 400792 }
158
159 21600 static uint8x16_t rgb_to_y(const uint32x4_t r[4], const uint32x4_t g[4],
160 const uint32x4_t b[4]) {
161 21600 const int kShifted16 = (16 << kWeightScale);
162 21600 const int kHalfShift = (1 << (kWeightScale - 1));
163
164 // Y = kR*R + kG*G + kB*B + rounding bias
165 21600 uint32x4_t v_kRYWeight = vdupq_n_u32(kRYWeight);
166 21600 uint32x4_t v_kGYWeight = vdupq_n_u32(kGYWeight);
167 21600 uint32x4_t v_kBYWeight = vdupq_n_u32(kBYWeight);
168 21600 uint32x4_t y[4];
169
170 KLEIDICV_FORCE_LOOP_UNROLL
171
8/8
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 21600 times.
✓ Branch 2 taken 5400 times.
✓ Branch 3 taken 21600 times.
✓ Branch 4 taken 5400 times.
✓ Branch 5 taken 21600 times.
✓ Branch 6 taken 5400 times.
✓ Branch 7 taken 21600 times.
108000 for (int i = 0; i < 4; i++) {
172 86400 y[i] = vdupq_n_u32(kHalfShift + kShifted16);
173 86400 y[i] = vmlaq_u32(y[i], r[i], v_kRYWeight);
174 86400 y[i] = vmlaq_u32(y[i], g[i], v_kGYWeight);
175 86400 y[i] = vmlaq_u32(y[i], b[i], v_kBYWeight);
176 86400 }
177
178 43200 return normalize_and_pack_y(y);
179 21600 }
180
181 90640 static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t uv[2]) {
182 90640 const int kHalfShift = (1 << (kWeightScale - 1));
183 90640 const int kShifted128 = (128 << kWeightScale);
184 90640 int uu = kRUWeight * r + kGUWeight * g + kBUWeight * b + kHalfShift +
185 kShifted128;
186 90640 int vv = kBUWeight * r + kGVWeight * g + kBVWeight * b + kHalfShift +
187 kShifted128;
188
189 90640 uv[0] = std::clamp(uu >> kWeightScale, 0, 0xff);
190 90640 uv[1] = std::clamp(vv >> kWeightScale, 0, 0xff);
191 90640 }
192
193 11200 static uint8x16_t compute_u_or_v_2x(const int32x4_t r[4],
194 const int32x4_t g[4],
195 const int32x4_t b[4], const int r_coeff,
196 const int g_coeff, const int b_coeff) {
197 // Constants for U/V calculation
198 11200 const int kHalfShift = (1 << (kWeightScale - 1));
199 11200 const int kShifted128 = (128 << kWeightScale);
200
201 11200 int32x4_t v_r_coeff = vdupq_n_s32(r_coeff);
202 11200 int32x4_t v_g_coeff = vdupq_n_s32(g_coeff);
203 11200 int32x4_t v_b_coeff = vdupq_n_s32(b_coeff);
204 11200 int32x4_t uv[4];
205
206 KLEIDICV_FORCE_LOOP_UNROLL
207
8/8
✓ Branch 0 taken 2800 times.
✓ Branch 1 taken 11200 times.
✓ Branch 2 taken 2800 times.
✓ Branch 3 taken 11200 times.
✓ Branch 4 taken 2800 times.
✓ Branch 5 taken 11200 times.
✓ Branch 6 taken 2800 times.
✓ Branch 7 taken 11200 times.
56000 for (int i = 0; i < 4; i++) {
208 44800 uv[i] = vdupq_n_s32(kHalfShift + kShifted128);
209 44800 uv[i] = vmlaq_s32(uv[i], r[i], v_r_coeff);
210 44800 uv[i] = vmlaq_s32(uv[i], g[i], v_g_coeff);
211 44800 uv[i] = vmlaq_s32(uv[i], b[i], v_b_coeff);
212 44800 }
213
214 22400 return normalize_and_pack_u_or_v(uv);
215 11200 }
216
217 5600 static void rgb_to_uv_2x(const int32x4_t r[4], const int32x4_t g[4],
218 const int32x4_t b[4], uint8x16_t &u, uint8x16_t &v) {
219 // ---------------- U (Cb) Component ----------------
220 // U = R * kRU + G * kGU + B * kBU + bias
221 5600 u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight);
222
223 // ---------------- V (Cr) Component ----------------
224 // V = R * kBU + G * kGV + B * kBV + bias (kBU is deliberately
225 // reused as V's R coefficient — in BT.601 kRV == kBU == 0.5)
225 5600 v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight);
226 5600 }
227
228 21600 static uint8x16_t normalize_and_pack_y(uint32x4_t vec[4]) {
229 // The y_index table selects the correct output order after normalization.
230 // When we load and separate the RGB values for UV calculation, we
231 // deinterleave them into even and odd components. As a result, the
232 // processed values are stored in two separate vectors. During
233 // normalization, we need to interleave them again to produce the final
234 // contiguous output, and this index pattern achieves that.
235 21600 uint8x16_t y_index = {1, 17, 3, 19, 5, 21, 7, 23,
236 9, 25, 11, 27, 13, 29, 15, 31};
237
238 // Normalize down by right-shifting the fixed-point result
239 // vshrn_n can only shift by an immediate value between 1 and 16.
240 // Since kWeightScale is 20, we use (kWeightScale - 8) to shift down to 12
241 // bits. This ensures that the most relevant 8-bit result lies in the second
242 // byte of each 16-bit element. As a result, the lookup tables are
243 // constructed with only odd indices to extract the second byte from each
244 // element.
245 21600 uint16x4_t tmp_lo_lo = vshrn_n_u32(vec[0], kWeightScale - 8);
246 43200 uint16x8_t tmp_lo_hi =
247 21600 vshrn_high_n_u32(tmp_lo_lo, vec[2], kWeightScale - 8);
248 21600 uint16x4_t tmp_hi_lo = vshrn_n_u32(vec[1], kWeightScale - 8);
249 43200 uint16x8_t tmp_hi_hi =
250 21600 vshrn_high_n_u32(tmp_hi_lo, vec[3], kWeightScale - 8);
251
252 21600 uint8x16x2_t tmp;
253 21600 tmp.val[0] = vreinterpretq_u8(tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14
254 21600 tmp.val[1] = vreinterpretq_u8(tmp_hi_hi); // 1, 3, 5, 7, 9, 11, 13, 15
255
256 21600 uint8x16_t output = vqtbl2q_u8(tmp, y_index);
257
258 43200 return output;
259 21600 }
260
261 11200 static uint8x16_t normalize_and_pack_u_or_v(int32x4_t vec[4]) {
262 // The uv_index table is used to finalize the order of U and V values.
263 // Unlike the Y component, we don't need to interleave even and odd elements
264 // manually. This is because the first vector already contains even-indexed
265 // values from the lower RGB block, and the second vector contains
266 // even-indexed values from the higher RGB block. As a result, the values
267 // are already sorted in the correct order for output.
268 11200 uint8x16_t uv_index = {1, 3, 5, 7, 9, 11, 13, 15,
269 17, 19, 21, 23, 25, 27, 29, 31};
270
271 // Normalize down by right-shifting the fixed-point result
272 // vshrn_n can only shift by an immediate value between 1 and 16.
273 // Since kWeightScale is 20, we use (kWeightScale - 8) to shift down to 12
274 // bits. This ensures that the most relevant 8-bit result lies in the second
275 // byte of each 16-bit element. As a result, the lookup tables are
276 // constructed with only odd indices to extract the second byte from each
277 // element.
278 11200 int16x4_t tmp_lo_lo = vshrn_n_s32(vec[0], kWeightScale - 8);
279 11200 int16x8_t tmp_lo_hi = vshrn_high_n_s32(tmp_lo_lo, vec[1], kWeightScale - 8);
280 11200 int16x4_t tmp_hi_lo = vshrn_n_s32(vec[2], kWeightScale - 8);
281 11200 int16x8_t tmp_hi_hi = vshrn_high_n_s32(tmp_hi_lo, vec[3], kWeightScale - 8);
282
283 11200 uint8x16x2_t tmp;
284 11200 tmp.val[0] = vreinterpretq_u8(
285 11200 tmp_lo_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the first vector
286 11200 tmp.val[1] = vreinterpretq_u8(
287 11200 tmp_hi_hi); // 0, 2, 4, 6, 8, 10, 12, 14 for the second vector
288 11200 uint8x16_t output = vqtbl2q_u8(tmp, uv_index);
289
290 22400 return output;
291 11200 }
292
293 10800 static void load_rgb_2x(uint32x4_t r0[4], uint32x4_t g0[4], uint32x4_t b0[4],
294 uint32x4_t r1[4], uint32x4_t g1[4], uint32x4_t b1[4],
295 const uint8_t *src_row, const size_t index) {
296 10800 uint8x16_t tmp_b0, tmp_b1, tmp_g0, tmp_g1, tmp_r0, tmp_r1;
297 // Load 32 pixels: two vectors of interleaved channels
298
299 if constexpr (kAlpha) {
300 // 4-channel input (RGBA or BGRA)
301 5400 uint8x16x4_t vsrc0 = vld4q_u8(src_row + scn * index);
302 5400 uint8x16x4_t vsrc1 =
303 5400 vld4q_u8(src_row + scn * index + scn * kVectorLength);
304
305 5400 tmp_b0 = vsrc0.val[b_index_];
306 5400 tmp_g0 = vsrc0.val[g_index_];
307 5400 tmp_r0 = vsrc0.val[r_index_];
308
309 5400 tmp_b1 = vsrc1.val[b_index_];
310 5400 tmp_g1 = vsrc1.val[g_index_];
311 5400 tmp_r1 = vsrc1.val[r_index_];
312 5400 } else {
313 // 3-channel input (RGB or BGR)
314 5400 uint8x16x3_t vsrc0 = vld3q_u8(src_row + scn * index);
315 5400 uint8x16x3_t vsrc1 =
316 5400 vld3q_u8(src_row + scn * index + scn * kVectorLength);
317
318 5400 tmp_b0 = vsrc0.val[b_index_];
319 5400 tmp_g0 = vsrc0.val[g_index_];
320 5400 tmp_r0 = vsrc0.val[r_index_];
321
322 5400 tmp_b1 = vsrc1.val[b_index_];
323 5400 tmp_g1 = vsrc1.val[g_index_];
324 5400 tmp_r1 = vsrc1.val[r_index_];
325 5400 }
326 // After loading the vector, we extend the channels and separate even and
327 // odd elements. This separation is important for UV calculation, as only
328 // the even-indexed values are used.
329 10800 uint8x16_t indices[4] = {
330 0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 4, 0xff, 0xff,
331 0xff, 6, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 3, 0xff,
332 0xff, 0xff, 5, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 8,
333 0xff, 0xff, 0xff, 10, 0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff,
334 14, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, 11, 0xff, 0xff,
335 0xff, 13, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};
336
337 // Expand each 8-bit channel into 32-bit vectors using table lookup and
338 // reinterpret
339 KLEIDICV_FORCE_LOOP_UNROLL
340
8/8
✓ Branch 0 taken 2700 times.
✓ Branch 1 taken 10800 times.
✓ Branch 2 taken 2700 times.
✓ Branch 3 taken 10800 times.
✓ Branch 4 taken 2700 times.
✓ Branch 5 taken 10800 times.
✓ Branch 6 taken 2700 times.
✓ Branch 7 taken 10800 times.
54000 for (int i = 0; i < 4; i++) {
341 43200 r0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r0, indices[i]));
342 43200 g0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g0, indices[i]));
343 43200 b0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b0, indices[i]));
344 43200 r1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r1, indices[i]));
345 43200 g1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g1, indices[i]));
346 43200 b1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b1, indices[i]));
347 43200 }
348 10800 }
349
350 static constexpr size_t r_index_ = RGB ? 0 : 2;
351 static constexpr size_t g_index_ = 1;
352 static constexpr size_t b_index_ = RGB ? 2 : 0;
353 static constexpr size_t scn = kAlpha ? 4 : 3;
354 };
355
356 } // namespace kleidicv::neon
357
358 #endif // KLEIDICV_RGB_TO_YUV420_H
359