| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/conversions/rgb_to_yuv.h" | ||
| 6 | #include "kleidicv/kleidicv.h" | ||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | #include "rgb_to_yuv444_coefficients.h" | ||
| 9 | namespace kleidicv::neon { | ||
| 10 | |||
| 11 | template <bool BGR, bool kAlpha> | ||
| 12 | class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop { | ||
| 13 | public: | ||
| 14 | using VecTraits = neon::VecTraits<uint8_t>; | ||
| 15 | using ScalarType = VecTraits::ScalarType; | ||
| 16 | using VectorType = VecTraits::VectorType; | ||
| 17 | using RawSourceVectorType = | ||
| 18 | typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type; | ||
| 19 | |||
| 20 | explicit RGBToYUVAll() = default; | ||
| 21 | |||
| 22 | // Returns the number of channels in the input image. | ||
| 23 | 260 | static constexpr size_t input_channels() { | |
| 24 | 260 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
| 25 | } | ||
| 26 | |||
| 27 | KLEIDICV_FORCE_INLINE | ||
| 28 | 1904 | void vector_path(const ScalarType *src, ScalarType *dst) { | |
| 29 | 1904 | RawSourceVectorType vsrc; | |
| 30 | 1904 | int16x8_t r_l, r_h, g_l, g_h, b_l, b_h; | |
| 31 | if constexpr (kAlpha) { | ||
| 32 | 952 | VecTraits::load(src, vsrc); | |
| 33 | |||
| 34 | 952 | uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]); | |
| 35 | 952 | uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]); | |
| 36 | if constexpr (BGR) { | ||
| 37 | 476 | b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0))); | |
| 38 | 476 | b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0))); | |
| 39 | 476 | r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0))); | |
| 40 | 476 | r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0))); | |
| 41 | } else { | ||
| 42 | 476 | r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0))); | |
| 43 | 476 | r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0))); | |
| 44 | 476 | b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0))); | |
| 45 | 476 | b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0))); | |
| 46 | } | ||
| 47 | 952 | uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]); | |
| 48 | 952 | g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0))); | |
| 49 | 952 | uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]); | |
| 50 | 952 | g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0))); | |
| 51 | 952 | } else { | |
| 52 | // Load deinterleaved | ||
| 53 | 952 | vsrc = vld3q_u8(src); | |
| 54 | 952 | r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0))); | |
| 55 | 952 | r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0))); | |
| 56 | 952 | g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0))); | |
| 57 | 952 | g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0))); | |
| 58 | 952 | b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0))); | |
| 59 | 952 | b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0))); | |
| 60 | } | ||
| 61 | // Compute Y value in 32-bit precision | ||
| 62 | 1904 | int16x8_t y_l, y_h; | |
| 63 | { | ||
| 64 | 1904 | int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight); | |
| 65 | 1904 | int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight); | |
| 66 | 1904 | int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight); | |
| 67 | 1904 | int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight); | |
| 68 | |||
| 69 | 1904 | y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight); | |
| 70 | 1904 | y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight); | |
| 71 | 1904 | y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight); | |
| 72 | 1904 | y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight); | |
| 73 | |||
| 74 | 1904 | y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight); | |
| 75 | 1904 | y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight); | |
| 76 | 1904 | y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight); | |
| 77 | 1904 | y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight); | |
| 78 | |||
| 79 | 1904 | y_l = combine_scaled_s16(y_ll, y_lh); | |
| 80 | 1904 | y_h = combine_scaled_s16(y_hl, y_hh); | |
| 81 | 1904 | } | |
| 82 | |||
| 83 | // Using the 16-bit Y value, calculate U | ||
| 84 | 1904 | int16x8_t u_l, u_h; | |
| 85 | { | ||
| 86 | 1904 | int16x8_t uy_l = vqsubq(b_l, y_l); | |
| 87 | 1904 | int16x8_t uy_h = vqsubq(b_h, y_h); | |
| 88 | |||
| 89 | 1904 | int32x4_t u_ll = vdupq_n_s32(half_); | |
| 90 | 1904 | int32x4_t u_lh = u_ll; | |
| 91 | 1904 | int32x4_t u_hl = u_ll; | |
| 92 | 1904 | int32x4_t u_hh = u_ll; | |
| 93 | |||
| 94 | 1904 | u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight); | |
| 95 | 1904 | u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight); | |
| 96 | 1904 | u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight); | |
| 97 | 1904 | u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight); | |
| 98 | |||
| 99 | 1904 | u_l = combine_scaled_s16(u_ll, u_lh); | |
| 100 | 1904 | u_h = combine_scaled_s16(u_hl, u_hh); | |
| 101 | 1904 | } | |
| 102 | |||
| 103 | // Using the 16-bit Y value, calculate V | ||
| 104 | 1904 | int16x8_t v_l, v_h; | |
| 105 | { | ||
| 106 | 1904 | int16x8_t vy_l = vqsubq(r_l, y_l); | |
| 107 | 1904 | int16x8_t vy_h = vqsubq(r_h, y_h); | |
| 108 | |||
| 109 | 1904 | int32x4_t v_ll = vdupq_n_s32(half_); | |
| 110 | 1904 | int32x4_t v_lh = v_ll; | |
| 111 | 1904 | int32x4_t v_hl = v_ll; | |
| 112 | 1904 | int32x4_t v_hh = v_ll; | |
| 113 | |||
| 114 | 1904 | v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight); | |
| 115 | 1904 | v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight); | |
| 116 | 1904 | v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight); | |
| 117 | 1904 | v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight); | |
| 118 | |||
| 119 | 1904 | v_l = combine_scaled_s16(v_ll, v_lh); | |
| 120 | 1904 | v_h = combine_scaled_s16(v_hl, v_hh); | |
| 121 | 1904 | } | |
| 122 | |||
| 123 | // Narrow the results to 8 bits | ||
| 124 | 1904 | uint8x16x3_t yuv; | |
| 125 | 1904 | yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h)); | |
| 126 | 1904 | yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h)); | |
| 127 | 1904 | yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h)); | |
| 128 | |||
| 129 | // Store interleaved YUV pixels to memory. | ||
| 130 | 1904 | vst3q_u8(dst, yuv); | |
| 131 | 1904 | } | |
| 132 | |||
| 133 | 492 | void scalar_path(const ScalarType *src, ScalarType *dst) { | |
| 134 | 984 | int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight + | |
| 135 | 492 | src[b_index_] * kBYWeight; | |
| 136 | 492 | y = rounding_shift_right(y, kWeightScale); | |
| 137 | 492 | int32_t u = (src[b_index_] - y) * kBUWeight + half_; | |
| 138 | 492 | u = rounding_shift_right(u, kWeightScale); | |
| 139 | 492 | int32_t v = (src[r_index_] - y) * kRVWeight + half_; | |
| 140 | 492 | v = rounding_shift_right(v, kWeightScale); | |
| 141 | 492 | dst[0] = saturating_cast<int32_t, uint8_t>(y); | |
| 142 | 492 | dst[1] = saturating_cast<int32_t, uint8_t>(u); | |
| 143 | 492 | dst[2] = saturating_cast<int32_t, uint8_t>(v); | |
| 144 | 492 | } | |
| 145 | |||
| 146 | private: | ||
| 147 | static constexpr size_t r_index_ = BGR ? 2 : 0; | ||
| 148 | static constexpr size_t g_index_ = 1; | ||
| 149 | static constexpr size_t b_index_ = BGR ? 0 : 2; | ||
| 150 | static constexpr size_t step_ = kAlpha ? 4 : 3; | ||
| 151 | static constexpr uint32_t half_ = | ||
| 152 | (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale; | ||
| 153 | |||
| 154 | KLEIDICV_FORCE_INLINE | ||
| 155 | 11424 | static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { | |
| 156 | 11424 | return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale); | |
| 157 | } | ||
| 158 | }; // end of class RGBToYUVAll<bool BGR, bool kAlpha> | ||
| 159 | |||
| 160 | template <typename OperationType, typename ScalarType> | ||
| 161 | 336 | kleidicv_error_t rgb2yuv_operation(OperationType &operation, | |
| 162 | const ScalarType *src, size_t src_stride, | ||
| 163 | ScalarType *dst, size_t dst_stride, | ||
| 164 | size_t width, size_t height) { | ||
| 165 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 80 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 80 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 80 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 80 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 80 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 80 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 80 times.
|
336 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 166 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 76 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 76 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 76 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 76 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 76 times.
|
320 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 167 |
24/24✓ Branch 0 taken 6 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 65 times.
✓ Branch 4 taken 11 times.
✓ Branch 5 taken 65 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 70 times.
✓ Branch 8 taken 5 times.
✓ Branch 9 taken 65 times.
✓ Branch 10 taken 11 times.
✓ Branch 11 taken 65 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 70 times.
✓ Branch 14 taken 5 times.
✓ Branch 15 taken 65 times.
✓ Branch 16 taken 11 times.
✓ Branch 17 taken 65 times.
✓ Branch 18 taken 6 times.
✓ Branch 19 taken 70 times.
✓ Branch 20 taken 5 times.
✓ Branch 21 taken 65 times.
✓ Branch 22 taken 11 times.
✓ Branch 23 taken 65 times.
|
304 | CHECK_IMAGE_SIZE(width, height); |
| 168 | |||
| 169 | 260 | Rectangle rect{width, height}; | |
| 170 | 260 | Rows src_rows{src, src_stride, operation.input_channels()}; | |
| 171 | 260 | Rows dst_rows{dst, dst_stride, 3}; | |
| 172 | |||
| 173 | 260 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 174 | 260 | return KLEIDICV_OK; | |
| 175 | 336 | } | |
| 176 | |||
| 177 | using RGBToYUV = RGBToYUVAll<false, false>; | ||
| 178 | using RGBAToYUV = RGBToYUVAll<false, true>; | ||
| 179 | using BGRToYUV = RGBToYUVAll<true, false>; | ||
| 180 | using BGRAToYUV = RGBToYUVAll<true, true>; | ||
| 181 | |||
| 182 | KLEIDICV_TARGET_FN_ATTRS | ||
| 183 | 360 | kleidicv_error_t rgb_to_yuv444_u8(const uint8_t *src, size_t src_stride, | |
| 184 | uint8_t *dst, size_t dst_stride, size_t width, | ||
| 185 | size_t height, | ||
| 186 | kleidicv_color_conversion_t color_format) { | ||
| 187 |
5/5✓ Branch 0 taken 84 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 84 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.
|
360 | switch (color_format) { |
| 188 | case KLEIDICV_RGB_TO_YUV444: { | ||
| 189 | 84 | RGBToYUV operation; | |
| 190 | 168 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, | |
| 191 | 84 | width, height); | |
| 192 | 84 | } | |
| 193 | |||
| 194 | case KLEIDICV_BGR_TO_YUV444: { | ||
| 195 | 84 | BGRToYUV operation; | |
| 196 | 168 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, | |
| 197 | 84 | width, height); | |
| 198 | 84 | } | |
| 199 | |||
| 200 | case KLEIDICV_RGBA_TO_YUV444: { | ||
| 201 | 84 | RGBAToYUV operation; | |
| 202 | 168 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, | |
| 203 | 84 | width, height); | |
| 204 | 84 | } | |
| 205 | |||
| 206 | case KLEIDICV_BGRA_TO_YUV444: { | ||
| 207 | 84 | BGRAToYUV operation; | |
| 208 | 168 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, | |
| 209 | 84 | width, height); | |
| 210 | 84 | } | |
| 211 | |||
| 212 | default: | ||
| 213 | 24 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 214 | } | ||
| 215 | |||
| 216 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | ||
| 217 | 360 | } | |
| 218 | |||
| 219 | } // namespace kleidicv::neon | ||
| 220 |