| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/conversions/rgb_to_yuv.h" | ||
| 6 | #include "kleidicv/kleidicv.h" | ||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | |||
| 9 | namespace kleidicv::neon { | ||
| 10 | |||
| 11 | template <bool BGR, bool kAlpha> | ||
| 12 | class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop { | ||
| 13 | public: | ||
| 14 | using VecTraits = neon::VecTraits<uint8_t>; | ||
| 15 | using ScalarType = VecTraits::ScalarType; | ||
| 16 | using VectorType = VecTraits::VectorType; | ||
| 17 | using RawSourceVectorType = | ||
| 18 | typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type; | ||
| 19 | |||
| 20 | explicit RGBToYUVAll() = default; | ||
| 21 | |||
| 22 | // Returns the number of channels in the input image. | ||
| 23 | 292 | static constexpr size_t input_channels() { | |
| 24 | 292 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
| 25 | } | ||
| 26 | |||
| 27 | 1920 | void vector_path(const ScalarType *src, ScalarType *dst) { | |
| 28 | 1920 | RawSourceVectorType vsrc; | |
| 29 | 1920 | int16x8_t r_l, r_h, g_l, g_h, b_l, b_h; | |
| 30 | if constexpr (kAlpha) { | ||
| 31 | 960 | VecTraits::load(src, vsrc); | |
| 32 | |||
| 33 | 960 | uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]); | |
| 34 | 960 | uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]); | |
| 35 | if constexpr (BGR) { | ||
| 36 | 480 | b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0))); | |
| 37 | 480 | b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0))); | |
| 38 | 480 | r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0))); | |
| 39 | 480 | r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0))); | |
| 40 | } else { | ||
| 41 | 480 | r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0))); | |
| 42 | 480 | r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0))); | |
| 43 | 480 | b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0))); | |
| 44 | 480 | b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0))); | |
| 45 | } | ||
| 46 | 960 | uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]); | |
| 47 | 960 | g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0))); | |
| 48 | 960 | uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]); | |
| 49 | 960 | g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0))); | |
| 50 | 960 | } else { | |
| 51 | // Load deinterleaved | ||
| 52 | 960 | vsrc = vld3q_u8(src); | |
| 53 | 960 | r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0))); | |
| 54 | 960 | r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0))); | |
| 55 | 960 | g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0))); | |
| 56 | 960 | g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0))); | |
| 57 | 960 | b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0))); | |
| 58 | 960 | b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0))); | |
| 59 | } | ||
| 60 | // Compute Y value in 32-bit precision | ||
| 61 | 1920 | int16x8_t y_l, y_h; | |
| 62 | { | ||
| 63 | 1920 | int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight); | |
| 64 | 1920 | int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight); | |
| 65 | 1920 | int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight); | |
| 66 | 1920 | int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight); | |
| 67 | |||
| 68 | 1920 | y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight); | |
| 69 | 1920 | y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight); | |
| 70 | 1920 | y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight); | |
| 71 | 1920 | y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight); | |
| 72 | |||
| 73 | 1920 | y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight); | |
| 74 | 1920 | y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight); | |
| 75 | 1920 | y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight); | |
| 76 | 1920 | y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight); | |
| 77 | |||
| 78 | 1920 | y_l = combine_scaled_s16(y_ll, y_lh); | |
| 79 | 1920 | y_h = combine_scaled_s16(y_hl, y_hh); | |
| 80 | 1920 | } | |
| 81 | |||
| 82 | // Using the 16-bit Y value, calculate U | ||
| 83 | 1920 | int16x8_t u_l, u_h; | |
| 84 | { | ||
| 85 | 1920 | int16x8_t uy_l = vqsubq(b_l, y_l); | |
| 86 | 1920 | int16x8_t uy_h = vqsubq(b_h, y_h); | |
| 87 | |||
| 88 | 1920 | int32x4_t u_ll = vdupq_n_s32(half_); | |
| 89 | 1920 | int32x4_t u_lh = u_ll; | |
| 90 | 1920 | int32x4_t u_hl = u_ll; | |
| 91 | 1920 | int32x4_t u_hh = u_ll; | |
| 92 | |||
| 93 | 1920 | u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight); | |
| 94 | 1920 | u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight); | |
| 95 | 1920 | u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight); | |
| 96 | 1920 | u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight); | |
| 97 | |||
| 98 | 1920 | u_l = combine_scaled_s16(u_ll, u_lh); | |
| 99 | 1920 | u_h = combine_scaled_s16(u_hl, u_hh); | |
| 100 | 1920 | } | |
| 101 | |||
| 102 | // Using the 16-bit Y value, calculate V | ||
| 103 | 1920 | int16x8_t v_l, v_h; | |
| 104 | { | ||
| 105 | 1920 | int16x8_t vy_l = vqsubq(r_l, y_l); | |
| 106 | 1920 | int16x8_t vy_h = vqsubq(r_h, y_h); | |
| 107 | |||
| 108 | 1920 | int32x4_t v_ll = vdupq_n_s32(half_); | |
| 109 | 1920 | int32x4_t v_lh = v_ll; | |
| 110 | 1920 | int32x4_t v_hl = v_ll; | |
| 111 | 1920 | int32x4_t v_hh = v_ll; | |
| 112 | |||
| 113 | 1920 | v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight); | |
| 114 | 1920 | v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight); | |
| 115 | 1920 | v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight); | |
| 116 | 1920 | v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight); | |
| 117 | |||
| 118 | 1920 | v_l = combine_scaled_s16(v_ll, v_lh); | |
| 119 | 1920 | v_h = combine_scaled_s16(v_hl, v_hh); | |
| 120 | 1920 | } | |
| 121 | |||
| 122 | // Narrow the results to 8 bits | ||
| 123 | 1920 | uint8x16x3_t yuv; | |
| 124 | 1920 | yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h)); | |
| 125 | 1920 | yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h)); | |
| 126 | 1920 | yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h)); | |
| 127 | |||
| 128 | // Store interleaved YUV pixels to memory. | ||
| 129 | 1920 | vst3q_u8(dst, yuv); | |
| 130 | 1920 | } | |
| 131 | |||
| 132 | 412 | void scalar_path(const ScalarType *src, ScalarType *dst) { | |
| 133 | 824 | int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight + | |
| 134 | 412 | src[b_index_] * kBYWeight; | |
| 135 | 412 | y = rounding_shift_right(y, kWeightScale); | |
| 136 | 412 | int32_t u = (src[b_index_] - y) * kBUWeight + half_; | |
| 137 | 412 | u = rounding_shift_right(u, kWeightScale); | |
| 138 | 412 | int32_t v = (src[r_index_] - y) * kRVWeight + half_; | |
| 139 | 412 | v = rounding_shift_right(v, kWeightScale); | |
| 140 | 412 | dst[0] = saturating_cast<int32_t, uint8_t>(y); | |
| 141 | 412 | dst[1] = saturating_cast<int32_t, uint8_t>(u); | |
| 142 | 412 | dst[2] = saturating_cast<int32_t, uint8_t>(v); | |
| 143 | 412 | } | |
| 144 | |||
| 145 | private: | ||
| 146 | static constexpr size_t r_index_ = BGR ? 2 : 0; | ||
| 147 | static constexpr size_t g_index_ = 1; | ||
| 148 | static constexpr size_t b_index_ = BGR ? 0 : 2; | ||
| 149 | static constexpr size_t step_ = kAlpha ? 4 : 3; | ||
| 150 | static constexpr uint32_t half_ = | ||
| 151 | (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale; | ||
| 152 | |||
| 153 | 11520 | static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { | |
| 154 | 11520 | return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale); | |
| 155 | } | ||
| 156 | }; // end of class RGBToYUVAll<bool BGR, bool kAlpha> | ||
| 157 | |||
| 158 | template <typename OperationType, typename ScalarType> | ||
| 159 | 356 | kleidicv_error_t rgb2yuv_operation(OperationType &operation, | |
| 160 | const ScalarType *src, size_t src_stride, | ||
| 161 | ScalarType *dst, size_t dst_stride, | ||
| 162 | size_t width, size_t height) { | ||
| 163 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 85 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 85 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 85 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 85 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 85 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 85 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 85 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 85 times.
|
356 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 164 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 81 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 81 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 81 times.
|
340 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 165 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 73 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 73 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 73 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 77 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 73 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 73 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 77 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 73 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 73 times.
|
324 | CHECK_IMAGE_SIZE(width, height); |
| 166 | |||
| 167 | 292 | Rectangle rect{width, height}; | |
| 168 | 292 | Rows src_rows{src, src_stride, operation.input_channels()}; | |
| 169 | 292 | Rows dst_rows{dst, dst_stride, 3}; | |
| 170 | |||
| 171 | 292 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 172 | 292 | return KLEIDICV_OK; | |
| 173 | 356 | } | |
| 174 | |||
| 175 | using RGBToYUV = RGBToYUVAll<false, false>; | ||
| 176 | using RGBAToYUV = RGBToYUVAll<false, true>; | ||
| 177 | using BGRToYUV = RGBToYUVAll<true, false>; | ||
| 178 | using BGRAToYUV = RGBToYUVAll<true, true>; | ||
| 179 | |||
| 180 | KLEIDICV_TARGET_FN_ATTRS | ||
| 181 | 89 | kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
| 182 | uint8_t *dst, size_t dst_stride, size_t width, | ||
| 183 | size_t height) { | ||
| 184 | 89 | RGBToYUV operation; | |
| 185 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 186 | 89 | height); | |
| 187 | 89 | } | |
| 188 | |||
| 189 | KLEIDICV_TARGET_FN_ATTRS | ||
| 190 | 89 | kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
| 191 | uint8_t *dst, size_t dst_stride, size_t width, | ||
| 192 | size_t height) { | ||
| 193 | 89 | RGBAToYUV operation; | |
| 194 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 195 | 89 | height); | |
| 196 | 89 | } | |
| 197 | |||
| 198 | KLEIDICV_TARGET_FN_ATTRS | ||
| 199 | 89 | kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
| 200 | uint8_t *dst, size_t dst_stride, size_t width, | ||
| 201 | size_t height) { | ||
| 202 | 89 | BGRToYUV operation; | |
| 203 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 204 | 89 | height); | |
| 205 | 89 | } | |
| 206 | |||
| 207 | KLEIDICV_TARGET_FN_ATTRS | ||
| 208 | 89 | kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride, | |
| 209 | uint8_t *dst, size_t dst_stride, size_t width, | ||
| 210 | size_t height) { | ||
| 211 | 89 | BGRAToYUV operation; | |
| 212 | 267 | return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, | |
| 213 | 89 | height); | |
| 214 | 89 | } | |
| 215 | |||
| 216 | } // namespace kleidicv::neon | ||
| 217 |