| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_RGB_TO_YUV420_H | ||
| 6 | #define KLEIDICV_RGB_TO_YUV420_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | #include <utility> | ||
| 10 | |||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/neon.h" | ||
| 13 | #include "yuv420_coefficients.h" | ||
| 14 | |||
| 15 | namespace kleidicv::neon { | ||
| 16 | |||
// Converts RGB-family images to YUV 4:2:0.
//
// Template parameters select the exact source/destination formats:
//   kAlpha      - true if the source has 4 channels (RGBA/BGRA), false for 3
//                 channels (RGB/BGR).
//   RGB         - true if the red channel comes first (RGB/RGBA), false if
//                 blue comes first (BGR/BGRA).
//   kInterleave - true for semi-planar chroma output (NV12/NV21: interleaved
//                 U/V rows), false for planar output (U plane followed by the
//                 V plane, both packed two half-rows per chroma stride).
//
// The Y/U/V weights (kRYWeight, kGUWeight, ...) and kWeightScale come from
// yuv420_coefficients.h and are fixed-point integers scaled by
// 2^kWeightScale.
template <bool kAlpha, bool RGB, bool kInterleave>
class RGBxorBGRxToYUV420 {
 public:
  // Entry point: converts rows [begin*2, min(height, end*2)) of the source
  // image.
  //
  // Parameters:
  //   src, src_stride   - source RGBx pixels and row stride in bytes.
  //   y_dst, y_stride   - destination luma plane and its row stride.
  //   uv_dst, uv_stride - destination chroma region and its row stride
  //                       (layout depends on kInterleave, see below).
  //   width, height     - image dimensions in pixels.
  //   v_first           - if true, V precedes U in the output (NV21 / YV12
  //                       ordering); otherwise U precedes V (NV12 / I420).
  //   begin, end        - half-row range, allowing the caller to split the
  //                       work (e.g. across threads); each unit covers two
  //                       image rows.
  //
  // Returns KLEIDICV_OK. Argument validation is presumably done by the
  // caller — TODO(review) confirm against the public API wrapper.
  static kleidicv_error_t rgb2yuv420_operation(
      const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
      uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
      bool v_first, size_t begin, size_t end) {
    // begin/end count pairs of rows; clamp the end to the image height so an
    // odd-height image processes its final single row.
    size_t row_begin = begin * 2;
    size_t row_end = std::min<size_t>(height, end * 2);

    const uint8_t *src_row = nullptr;
    uint8_t *y_row = nullptr;
    uint8_t *u_row = nullptr;
    uint8_t *v_row = nullptr;
    for (size_t h = row_begin; h < row_end; h++) {
      src_row = src + src_stride * h;
      y_row = y_dst + y_stride * h;

      // Chroma is only produced on even rows (4:2:0 vertical subsampling),
      // so the u_row/v_row pointers are refreshed only then; odd rows reuse
      // the previous row's pointers but never store through them.
      bool evenRow = (h & 1) == 0;

      if (evenRow) {
        if constexpr (kInterleave) {
          // Semi-planar: one interleaved UV row per two image rows.
          u_row = uv_dst + uv_stride * (h / 2);
        } else {
          // Planar: two half-width chroma rows are packed into each
          // uv_stride-sized row, alternating left/right halves.
          u_row =
              uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2);
          // Pointer to the start of the V plane.
          // The V plane follows the U plane. Both U and V planes are
          // subsampled at a 2:1 vertical ratio (i.e., each has height / 2
          // rows) and stored in a single contiguous chroma region.
          // Depending on image height and stride, the starting offset
          // of V may require adjustment, so a
          // fractional offset (in rows) is applied to calculate the V plane
          // position.
          v_row = uv_dst + uv_stride * ((h + height + 1) / 4) +
                  (((h + height + 1) / 2) % 2) * ((width + 1) / 2);
        }
      }

      // Main loop: 32 pixels at a time (two 16-lane vectors) with a scalar
      // tail for the remaining columns.
      LoopUnroll2<TryToAvoidTailLoop> loop{width, kVectorLength};
      loop.unroll_twice([&](size_t index) {
        vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow);
      });

      loop.tail([&](size_t index) {
        scalar_path(src_row, y_row, u_row, v_row, v_first, index, width,
                    evenRow);
      });
    }

    return KLEIDICV_OK;
  }

 private:
  // Vector path: processes 2 * kVectorLength (32) pixels starting at column
  // `index`. Always emits 32 Y samples; emits 16 U and 16 V samples only on
  // even rows.
  static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row,
                             uint8_t *u_row, uint8_t *v_row, const bool v_first,
                             const size_t index, const bool evenRow) {
    // Each channel is widened to four uint32x4_t vectors per 16-pixel block,
    // with even-column values in slots [0] and [2] (see load_rgb_2x).
    uint32x4_t r0[4], g0[4], b0[4], r1[4], g1[4], b1[4];

    load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, index);

    uint8x16_t y0 = rgb_to_y(r0, g0, b0);

    uint8x16_t y1 = rgb_to_y(r1, g1, b1);

    vst1q_u8(y_row + index, y0);
    vst1q_u8(y_row + index + kVectorLength, y1);

    // U and V are subsampled by a factor of 2 in both horizontal and vertical
    // directions for YUV420 format. Therefore, we only compute U and V from
    // even rows and even columns. When the input RGB image has an odd width or
    // height, the chroma (U and V) dimensions are rounded up. For example, if
    // the height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2
    // = 4.5 -> rounded up). The same rounding is applied for width.
    if (evenRow) {
      uint8x16x2_t uv;
      // Gather the even-column lanes of both 16-pixel blocks: load_rgb_2x
      // places even columns in array slots [0] and [2].
      int32x4_t r_even[4] = {r0[0], r0[2], r1[0], r1[2]};
      int32x4_t g_even[4] = {g0[0], g0[2], g1[0], g1[2]};
      int32x4_t b_even[4] = {b0[0], b0[2], b1[0], b1[2]};
      rgb_to_uv_2x(r_even, g_even, b_even, uv.val[0], uv.val[1]);

      // NV21/YV12-style ordering puts V before U.
      if (v_first) {
        std::swap(uv.val[0], uv.val[1]);
      }

      if constexpr (kInterleave) {
        // Semi-planar: interleaved store writes U0 V0 U1 V1 ...
        vst2q_u8(u_row + index, uv);
      } else {
        // Planar: half-width chroma rows, one vector per plane.
        vst1q_u8(u_row + index / 2, uv.val[0]);
        vst1q_u8(v_row + index / 2, uv.val[1]);
      }
    }
  }

  // Scalar tail: processes columns [index, length) one pixel at a time.
  // Mirrors vector_path_2x exactly: Y for every pixel, U/V only on even
  // row/column positions.
  static void scalar_path(const uint8_t *src_row, uint8_t *y_row,
                          uint8_t *u_row, uint8_t *v_row, const bool v_first,
                          size_t index, const size_t length,
                          const bool evenRow) {
    // uv[0] holds U and uv[1] holds V (see rgb_to_uv); these indices select
    // the output ordering requested by v_first.
    const size_t u_index_ = v_first;
    const size_t v_index_ = !v_first;

    for (; index < length; index += 1) {
      uint8_t b0{}, g0{}, r0{};
      bool evenCol = (index & 1) == 0;
      b0 = src_row[index * scn + b_index_];
      g0 = src_row[index * scn + g_index_];
      r0 = src_row[index * scn + r_index_];

      uint8_t y0 = rgb_to_y(r0, g0, b0);
      y_row[index] = y0;

      // U and V are subsampled by a factor of 2 in both horizontal and vertical
      // directions
      // for YUV420 format. Therefore, we only compute U and V from even rows
      // and even columns. When the input RGB image has an odd width or height,
      // the chroma (U and V) dimensions are rounded up. For example, if the
      // height is 9, Y will be 9 rows, but U and V will be 5 rows (9 / 2 = 4.5
      // -> rounded up). The same rounding is applied for width.
      if (evenRow && evenCol) {
        uint8_t uv[2] = {0, 0};
        rgb_to_uv(r0, g0, b0, uv);
        if constexpr (kInterleave) {
          // Interleaved chroma: the pair lands at the pixel's column offset.
          // For an odd width the rounded-up chroma row makes index + 1 a
          // valid byte — presumably guaranteed by the buffer-size contract;
          // TODO(review) confirm.
          u_row[index] = uv[u_index_];
          u_row[index + 1] = uv[v_index_];
        } else {
          // Planar: index is even here, so (index + 1) / 2 == index / 2,
          // matching the vector path's store offset.
          u_row[(index + 1) / 2] = uv[u_index_];
          v_row[(index + 1) / 2] = uv[v_index_];
        }
      }
    }
  }

  // Scalar fixed-point luma: Y = (kRY*R + kGY*G + kBY*B) >> kWeightScale,
  // with +16 black-level offset and round-to-nearest bias, clamped to 8 bits.
  static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
    const int kShifted16 = (16 << kWeightScale);   // +16 offset, pre-scaled
    const int kHalfShift = (1 << (kWeightScale - 1));  // rounding bias (0.5)
    int yy =
        kRYWeight * r + kGYWeight * g + kBYWeight * b + kHalfShift + kShifted16;

    return std::clamp(yy >> kWeightScale, 0, 0xff);
  }

  // Vector luma for 16 pixels: same formula as the scalar rgb_to_y, computed
  // in 32-bit lanes and narrowed/reordered by normalize_and_pack_y.
  static uint8x16_t rgb_to_y(const uint32x4_t r[4], const uint32x4_t g[4],
                             const uint32x4_t b[4]) {
    const int kShifted16 = (16 << kWeightScale);
    const int kHalfShift = (1 << (kWeightScale - 1));

    // Y = kR*R + kG*G + kB*B + rounding bias
    uint32x4_t v_kRYWeight = vdupq_n_u32(kRYWeight);
    uint32x4_t v_kGYWeight = vdupq_n_u32(kGYWeight);
    uint32x4_t v_kBYWeight = vdupq_n_u32(kBYWeight);
    uint32x4_t y[4];

    KLEIDICV_FORCE_LOOP_UNROLL
    for (int i = 0; i < 4; i++) {
      y[i] = vdupq_n_u32(kHalfShift + kShifted16);
      y[i] = vmlaq_u32(y[i], r[i], v_kRYWeight);
      y[i] = vmlaq_u32(y[i], g[i], v_kGYWeight);
      y[i] = vmlaq_u32(y[i], b[i], v_kBYWeight);
    }

    return normalize_and_pack_y(y);
  }

  // Scalar fixed-point chroma: writes U to uv[0] and V to uv[1], each with a
  // +128 offset and rounding bias, clamped to 8 bits.
  //
  // NOTE(review): V reuses kBUWeight as its R coefficient — presumably the
  // R->V and B->U weights are the same value in yuv420_coefficients.h (they
  // are equal in BT.601); the vector path does the same. Confirm against the
  // coefficient header.
  static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t uv[2]) {
    const int kHalfShift = (1 << (kWeightScale - 1));   // rounding bias (0.5)
    const int kShifted128 = (128 << kWeightScale);      // +128 chroma offset
    int uu = kRUWeight * r + kGUWeight * g + kBUWeight * b + kHalfShift +
             kShifted128;
    int vv = kBUWeight * r + kGVWeight * g + kBVWeight * b + kHalfShift +
             kShifted128;

    uv[0] = std::clamp(uu >> kWeightScale, 0, 0xff);
    uv[1] = std::clamp(vv >> kWeightScale, 0, 0xff);
  }

  // Computes one chroma component (U or V, selected by the coefficients) for
  // 16 even-column pixels, including the +128 offset and rounding bias, then
  // narrows the result to 8-bit lanes.
  static uint8x16_t compute_u_or_v_2x(const int32x4_t r[4],
                                      const int32x4_t g[4],
                                      const int32x4_t b[4], const int r_coeff,
                                      const int g_coeff, const int b_coeff) {
    // Constants for U/V calculation
    const int kHalfShift = (1 << (kWeightScale - 1));
    const int kShifted128 = (128 << kWeightScale);

    int32x4_t v_r_coeff = vdupq_n_s32(r_coeff);
    int32x4_t v_g_coeff = vdupq_n_s32(g_coeff);
    int32x4_t v_b_coeff = vdupq_n_s32(b_coeff);
    int32x4_t uv[4];

    KLEIDICV_FORCE_LOOP_UNROLL
    for (int i = 0; i < 4; i++) {
      uv[i] = vdupq_n_s32(kHalfShift + kShifted128);
      uv[i] = vmlaq_s32(uv[i], r[i], v_r_coeff);
      uv[i] = vmlaq_s32(uv[i], g[i], v_g_coeff);
      uv[i] = vmlaq_s32(uv[i], b[i], v_b_coeff);
    }

    return normalize_and_pack_u_or_v(uv);
  }

  // Vector chroma for 16 even-column pixels: fills u and v with the packed
  // U and V samples (same coefficient choice as the scalar rgb_to_uv,
  // including the kBUWeight reuse for V's R coefficient).
  static void rgb_to_uv_2x(const int32x4_t r[4], const int32x4_t g[4],
                           const int32x4_t b[4], uint8x16_t &u, uint8x16_t &v) {
    // ---------------- U (Cb) Component ----------------
    // U = R * kRU + G * kGU + B * kBU + bias
    u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight);

    // ---------------- V (Cr) Component ----------------
    // V = R * kBU + G * kGV + B * kBV + bias
    v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight);
  }

  // Narrows four 32-bit Y accumulators to 16 bytes, undoing the even/odd
  // column split introduced by load_rgb_2x.
  static uint8x16_t normalize_and_pack_y(uint32x4_t vec[4]) {
    // The y_index table selects the correct output order after normalization.
    // When we load and separate the RGB values for UV calculation, we
    // deinterleave them into even and odd components. As a result, the
    // processed values are stored in two separate vectors. During
    // normalization, we need to interleave them again to produce the final
    // contiguous output, and this index pattern achieves that.
    uint8x16_t y_index = {1, 17, 3, 19, 5, 21, 7, 23,
                          9, 25, 11, 27, 13, 29, 15, 31};

    // Normalize down by right-shifting the fixed-point result
    // vshrn_n can only shift by an immediate value between 1 and 16.
    // Since kWeightScale is 20, we use (kWeightScale - 8) to shift down to 12
    // bits. This ensures that the most relevant 8-bit result lies in the second
    // byte of each 16-bit element. As a result, the lookup tables are
    // constructed with only odd indices to extract the second byte from each
    // element.
    uint16x4_t tmp_lo_lo = vshrn_n_u32(vec[0], kWeightScale - 8);
    uint16x8_t tmp_lo_hi =
        vshrn_high_n_u32(tmp_lo_lo, vec[2], kWeightScale - 8);
    uint16x4_t tmp_hi_lo = vshrn_n_u32(vec[1], kWeightScale - 8);
    uint16x8_t tmp_hi_hi =
        vshrn_high_n_u32(tmp_hi_lo, vec[3], kWeightScale - 8);

    uint8x16x2_t tmp;
    tmp.val[0] = vreinterpretq_u8(tmp_lo_hi);  // 0, 2, 4, 6, 8, 10, 12, 14
    tmp.val[1] = vreinterpretq_u8(tmp_hi_hi);  // 1, 3, 5, 7, 9, 11, 13, 15

    uint8x16_t output = vqtbl2q_u8(tmp, y_index);

    return output;
  }

  // Narrows four 32-bit U or V accumulators to 16 bytes; unlike Y, the lanes
  // are already in output order, so a straight odd-byte extraction suffices.
  static uint8x16_t normalize_and_pack_u_or_v(int32x4_t vec[4]) {
    // The uv_index table is used to finalize the order of U and V values.
    // Unlike the Y component, we don't need to interleave even and odd elements
    // manually. This is because the first vector already contains even-indexed
    // values from the lower RGB block, and the second vector contains
    // even-indexed values from the higher RGB block. As a result, the values
    // are already sorted in the correct order for output.
    uint8x16_t uv_index = {1, 3, 5, 7, 9, 11, 13, 15,
                           17, 19, 21, 23, 25, 27, 29, 31};

    // Normalize down by right-shifting the fixed-point result
    // vshrn_n can only shift by an immediate value between 1 and 16.
    // Since kWeightScale is 20, we use (kWeightScale - 8) to shift down to 12
    // bits. This ensures that the most relevant 8-bit result lies in the second
    // byte of each 16-bit element. As a result, the lookup tables are
    // constructed with only odd indices to extract the second byte from each
    // element.
    int16x4_t tmp_lo_lo = vshrn_n_s32(vec[0], kWeightScale - 8);
    int16x8_t tmp_lo_hi = vshrn_high_n_s32(tmp_lo_lo, vec[1], kWeightScale - 8);
    int16x4_t tmp_hi_lo = vshrn_n_s32(vec[2], kWeightScale - 8);
    int16x8_t tmp_hi_hi = vshrn_high_n_s32(tmp_hi_lo, vec[3], kWeightScale - 8);

    uint8x16x2_t tmp;
    tmp.val[0] = vreinterpretq_u8(
        tmp_lo_hi);  // 0, 2, 4, 6, 8, 10, 12, 14 for the first vector
    tmp.val[1] = vreinterpretq_u8(
        tmp_hi_hi);  // 0, 2, 4, 6, 8, 10, 12, 14 for the second vector
    uint8x16_t output = vqtbl2q_u8(tmp, uv_index);

    return output;
  }

  // Loads 32 pixels starting at column `index`, deinterleaves the channels,
  // and widens each channel to uint32 lanes. Output layout per 16-pixel
  // block: slots [0]/[2] hold even columns, slots [1]/[3] hold odd columns —
  // the even/odd split lets the chroma path pick even columns cheaply.
  static void load_rgb_2x(uint32x4_t r0[4], uint32x4_t g0[4], uint32x4_t b0[4],
                          uint32x4_t r1[4], uint32x4_t g1[4], uint32x4_t b1[4],
                          const uint8_t *src_row, const size_t index) {
    uint8x16_t tmp_b0, tmp_b1, tmp_g0, tmp_g1, tmp_r0, tmp_r1;
    // Load 32 pixels: two vectors of interleaved channels

    if constexpr (kAlpha) {
      // 4-channel input (RGBA or BGRA)
      uint8x16x4_t vsrc0 = vld4q_u8(src_row + scn * index);
      uint8x16x4_t vsrc1 =
          vld4q_u8(src_row + scn * index + scn * kVectorLength);

      tmp_b0 = vsrc0.val[b_index_];
      tmp_g0 = vsrc0.val[g_index_];
      tmp_r0 = vsrc0.val[r_index_];

      tmp_b1 = vsrc1.val[b_index_];
      tmp_g1 = vsrc1.val[g_index_];
      tmp_r1 = vsrc1.val[r_index_];
    } else {
      // 3-channel input (RGB or BGR)
      uint8x16x3_t vsrc0 = vld3q_u8(src_row + scn * index);
      uint8x16x3_t vsrc1 =
          vld3q_u8(src_row + scn * index + scn * kVectorLength);

      tmp_b0 = vsrc0.val[b_index_];
      tmp_g0 = vsrc0.val[g_index_];
      tmp_r0 = vsrc0.val[r_index_];

      tmp_b1 = vsrc1.val[b_index_];
      tmp_g1 = vsrc1.val[g_index_];
      tmp_r1 = vsrc1.val[r_index_];
    }
    // After loading the vector, we extend the channels and separate even and
    // odd elements. This separation is important for UV calculation, as only
    // the even-indexed values are used.
    // Each table below spreads four source bytes into the low byte of each
    // 32-bit lane (0xff lanes produce zero), performing the u8 -> u32 widen.
    uint8x16_t indices[4] = {
        0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 4, 0xff, 0xff,
        0xff, 6, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 3, 0xff,
        0xff, 0xff, 5, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 8,
        0xff, 0xff, 0xff, 10, 0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff,
        14, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, 11, 0xff, 0xff,
        0xff, 13, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff};

    // Expand each 8-bit channel into 32-bit vectors using table lookup and
    // reinterpret
    KLEIDICV_FORCE_LOOP_UNROLL
    for (int i = 0; i < 4; i++) {
      r0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r0, indices[i]));
      g0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g0, indices[i]));
      b0[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b0, indices[i]));
      r1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_r1, indices[i]));
      g1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_g1, indices[i]));
      b1[i] = vreinterpretq_u32_u8(vqtbl1q_u8(tmp_b1, indices[i]));
    }
  }

  // Channel byte offsets within a pixel, resolved at compile time from the
  // RGB/BGR template flag; scn is the source channel count (3 or 4).
  static constexpr size_t r_index_ = RGB ? 0 : 2;
  static constexpr size_t g_index_ = 1;
  static constexpr size_t b_index_ = RGB ? 2 : 0;
  static constexpr size_t scn = kAlpha ? 4 : 3;
};
| 354 | |||
| 355 | } // namespace kleidicv::neon | ||
| 356 | |||
| 357 | #endif // KLEIDICV_RGB_TO_YUV420_H | ||
| 358 |