| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_YUV420_TO_RGB_NEON_H | ||
| 6 | #define KLEIDICV_YUV420_TO_RGB_NEON_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | #include <utility> | ||
| 10 | |||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/traits.h" | ||
| 13 | #include "yuv420_coefficients.h" | ||
| 14 | |||
| 15 | namespace kleidicv::neon { | ||
| 16 | |||
// Converts YUV 4:2:0 image data to interleaved RGB(A) or BGR(A).
//
// Template parameters:
//   BGR    - if true, output channel order is B,G,R(,A); otherwise R,G,B(,A).
//   kAlpha - if true, a fourth, fully-opaque (0xFF) alpha channel is written.
//
// The converter processes two image rows at a time, since one row of chroma
// (U/V) samples is shared by two rows of luma (Y) samples in 4:2:0 subsampling.
// Fixed-point weights (kYWeight, kUVWeights, kWeightScale) come from
// yuv420_coefficients.h.
template <bool BGR, bool kAlpha>
class YUV420XToRGBxOrBGRx {
 public:
  using ScalarType = uint8_t;
  using VectorType = uint8x16_t;

  // Luma weight broadcast into all four 32-bit lanes.
  int32x4_t y_weight_;
  // Chroma weights, de-interleaved by vld2_s32 so that lane accessors in
  // vmlaq_lane_s32 can pick individual weights below.
  int32x2x2_t uv_weights_;
  // Per-channel base terms. Each folds together:
  //   * the rounding bias (1 << (kWeightScale - 1)), so the plain arithmetic
  //     shifts in combine_scaled_s16() yield rounded results, and
  //   * the -128 chroma offset pre-multiplied by that channel's UV weight(s).
  int32x4_t r_base_, g_base_, b_base_;
  // TBL lookup indices loaded from kDeInterleaveTableIndices.
  int8x16x4_t de_interleave_indices_;
  // True when the chroma plane stores V before U (e.g. YV12/NV21-style
  // layouts); the U/V inputs are swapped at conversion time.
  const bool v_first_;

  // Returns the number of channels in the output image.
  static constexpr size_t output_channels() {
    return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
  }

  // Scales two int32x4_t vectors down by kWeightScale (arithmetic shift;
  // rounding was pre-added via the *_base_ members), narrows each to 16 bits
  // and combines them into a single int16x8_t.
  static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
    return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)),
                        vmovn_s32(vshrq_n_s32(b, kWeightScale)));
  }

  // clang-format off

  // TBL indices that split a 16-byte vector into four vectors of four bytes,
  // each byte placed in the low byte of a 32-bit lane (the -1 entries zero
  // the remaining bytes), i.e. a byte-to-u32 widening de-interleave:
  //   val[0] = even bytes 0..6,  val[1] = even bytes 8..14,
  //   val[2] = odd bytes 1..7,   val[3] = odd bytes 9..15.
  static constexpr int8_t kDeInterleaveTableIndices[64] = {
      /* low and even */
      0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1,
      /* high and even */
      8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1,
      /* low and odd */
      1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1,
      /* high and odd */
      9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1,
  };

  // clang-format on

  // Precomputes all weight/base vectors and loads the de-interleave table.
  // v_first: true when the source chroma order is V-then-U (see v_first_).
  explicit YUV420XToRGBxOrBGRx(bool v_first)
      : y_weight_{vdupq_n_s32(kYWeight)},
        uv_weights_{vld2_s32(kUVWeights)},
        r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
                            128 * kUVWeights[kRVWeightIndex])},
        g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) -
                            128 * (kUVWeights[1] + kUVWeights[2]))},
        b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])},
        de_interleave_indices_{},
        v_first_{v_first} {
    neon::VecTraits<int8_t>::load(kDeInterleaveTableIndices,
                                  de_interleave_indices_);
  }

  // Vector path: converts a 16-pixel-wide, 2-row block.
  //
  // y0, y1:            16 luma bytes for the upper and lower row.
  // u_l/u_h, v_l/v_h:  the 8 shared chroma samples, already widened to 32-bit
  //                    lanes and offset, split into low/high halves (each
  //                    chroma sample covers a 2x2 pixel block).
  // rgbx_row_0/1:      destination pointers for the two output rows; 16
  //                    pixels of output_channels() bytes each are stored.
  //
  // NOTE(review): several intrinsic calls below mix signed and unsigned
  // vector types (e.g. vqtbl1q_s8 on a uint8x16_t, vmulq_s32 results stored
  // into uint32x4_t). This presumably relies on the build enabling lax vector
  // conversions - confirm against the project's compiler flags.
  void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l,
                      int32x4_t u_h, int32x4_t v_l, int32x4_t v_h,
                      ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) {
    // Y' = saturating(Ya - 16) and widen to 32-bits.
    uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16));
    uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16));

    // TBL widens each selected byte into a zero-extended 32-bit lane,
    // de-interleaving even and odd pixels (even/odd pixels in a pair share
    // the same chroma sample).
    uint32x4_t y0_m16_even_l =
        vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0]));
    uint32x4_t y0_m16_even_h =
        vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1]));
    uint32x4_t y0_m16_odd_l =
        vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2]));
    uint32x4_t y0_m16_odd_h =
        vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3]));

    uint32x4_t y1_m16_even_l =
        vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0]));
    uint32x4_t y1_m16_even_h =
        vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1]));
    uint32x4_t y1_m16_odd_l =
        vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2]));
    uint32x4_t y1_m16_odd_h =
        vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3]));

    // Y = Weight(Y) * Y'
    y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_);
    y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_);
    y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_);
    y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_);

    y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_);
    y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_);
    y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_);
    y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_);

    // Swap U and V planes for YV12 layout.
    if (v_first_) {
      std::swap(u_l, v_l);
      std::swap(u_h, v_h);
    }

    // R - Y = Rbase + Weight(RV) * V =
    //         Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V
    int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0);
    int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0);

    // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V =
    //         Weight(GU) * ((1 << (SCALE - 1)) - 128) +
    //         Weight(GV) * ((1 << (SCALE - 1)) - 128) +
    //         Weight(GU) * U + Weight(GV) * V
    int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0);
    int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0);
    g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1);
    g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1);

    // B - Y = Bbase + Weight(BU) * U =
    //         Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U
    int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1);
    int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1);

    // R = (R - Y) + Y
    int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l);
    int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h);
    int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l);
    int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h);
    int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h);
    int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h);

    int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l);
    int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h);
    int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l);
    int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h);
    int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h);
    int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h);

    // G = (G - Y) + Y
    int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l);
    int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h);
    int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l);
    int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h);
    int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h);
    int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h);

    int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l);
    int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h);
    int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l);
    int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h);
    int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h);
    int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h);

    // B = (B - Y) + Y
    int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l);
    int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h);
    int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l);
    int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h);
    int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h);
    int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h);

    int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l);
    int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h);
    int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l);
    int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h);
    int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h);
    int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h);

    // Zip even and odd RGB pixels.
    // vqmovun_s16 saturates the signed 16-bit results into [0, 255].
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

    if constexpr (kAlpha) {
      uint8x16x4_t rgba0, rgba1;
      // Red channel
      rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]);
      rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]);
      // Green channel
      rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]);
      rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
      // Blue channel
      rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]);
      rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]);
      // Alpha channel
      rgba0.val[3] = vdupq_n_u8(0xFF);
      rgba1.val[3] = vdupq_n_u8(0xFF);

      // For BGR(A) output, exchange the red and blue channels before storing.
      if constexpr (BGR) {
        std::swap(rgba0.val[0], rgba0.val[2]);
        std::swap(rgba1.val[0], rgba1.val[2]);
      }

      // Store RGB pixels to memory.
      vst4q_u8(rgbx_row_0, rgba0);
      vst4q_u8(rgbx_row_1, rgba1);
    } else {
      uint8x16x3_t rgb0, rgb1;
      // Red channel
      rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]);
      rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]);
      // Green channel
      rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]);
      rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
      // Blue channel
      rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]);
      rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]);

      // For BGR output, exchange the red and blue channels before storing.
      if constexpr (BGR) {
        std::swap(rgb0.val[0], rgb0.val[2]);
        std::swap(rgb1.val[0], rgb1.val[2]);
      }

      // Store RGB pixels to memory.
      vst3q_u8(rgbx_row_0, rgb0);
      vst3q_u8(rgbx_row_1, rgb1);
    }
  }

  // Scalar tail path: converts one pixel column in each of the two rows.
  //
  // y_rows:         the two source luma rows.
  // index:          column of the pixel to convert.
  // u_m128, v_m128: the shared chroma sample for this column, already
  //                 offset by -128 (per the parameter names).
  // rgbx_rows:      destination pointers; each is advanced past the pixel
  //                 just written (3 or 4 bytes), so callers can invoke this
  //                 repeatedly for consecutive columns.
  //
  // Unlike the vector path, rounding here comes from rounding_shift_right
  // rather than a pre-added bias; the U/V swap for v_first_ layouts is
  // presumably handled by the caller - confirm at the call site.
  void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128,
                      int32_t v_m128, uint8_t *rgbx_rows[2]) {
    for (size_t selector = 0; selector < 2; ++selector) {
      // Y' = max(Ya - 16, 0), then apply the luma weight.
      int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0);
      int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128;
      int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 +
                  kUVWeights[kGVWeightIndex] * v_m128;
      int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128;

      r = rounding_shift_right(r, kWeightScale);
      g = rounding_shift_right(g, kWeightScale);
      b = rounding_shift_right(b, kWeightScale);

      if constexpr (BGR) {
        std::swap(r, b);
      }

      // Saturate to [0, 255] and store.
      rgbx_rows[selector][0] = saturating_cast<int32_t, uint8_t>(r);
      rgbx_rows[selector][1] = saturating_cast<int32_t, uint8_t>(g);
      rgbx_rows[selector][2] = saturating_cast<int32_t, uint8_t>(b);

      if constexpr (kAlpha) {
        rgbx_rows[selector][3] = 0xFF;
      }

      rgbx_rows[selector] += kAlpha ? 4 : 3;
    }
  }
};
| 257 | } // namespace kleidicv::neon | ||
| 258 | |||
| 259 | #endif // KLEIDICV_YUV420_TO_RGB_NEON_H | ||
| 260 |