| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_RGB_TO_YUV420_SC_H | ||
| 6 | #define KLEIDICV_RGB_TO_YUV420_SC_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | |||
| 10 | #include "kleidicv/sve2.h" | ||
| 11 | #include "yuv420_coefficients.h" | ||
| 12 | |||
| 13 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 14 | |||
| 15 | template <bool kAlpha, bool RGB, bool kInterleave> | ||
| 16 | class RGBxorBGRxToYUV420 { | ||
| 17 | public: | ||
| 18 | using ArrayOfFour_svuint32 = ScalableVectorArray1D<svuint32_t, 4>; | ||
| 19 | using ArrayOfFour_svint32 = ScalableVectorArray1D<svint32_t, 4>; | ||
| 20 | using ArrayOfTwo_svint32 = ScalableVectorArray1D<svint32_t, 2>; | ||
| 21 | |||
| 22 | 1504 | static kleidicv_error_t rgb2yuv420_operation_sc( | |
| 23 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
| 24 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
| 25 | bool v_first, size_t begin, size_t end) KLEIDICV_STREAMING { | ||
| 26 | 1504 | size_t row_begin = begin * 2; | |
| 27 | 1504 | size_t row_end = std::min<size_t>(height, end * 2); | |
| 28 | 1504 | const uint8_t *src_row = nullptr; | |
| 29 | 1504 | uint8_t *y_row = nullptr; | |
| 30 | 1504 | uint8_t *u_row = nullptr; | |
| 31 | 1504 | uint8_t *v_row = nullptr; | |
| 32 |
8/8✓ Branch 0 taken 376 times.
✓ Branch 1 taken 19240 times.
✓ Branch 2 taken 376 times.
✓ Branch 3 taken 19240 times.
✓ Branch 4 taken 376 times.
✓ Branch 5 taken 19240 times.
✓ Branch 6 taken 376 times.
✓ Branch 7 taken 19240 times.
|
78464 | for (size_t h = row_begin; h < row_end; h++) { |
| 33 | 76960 | src_row = src + src_stride * h; | |
| 34 | 76960 | y_row = y_dst + y_stride * h; | |
| 35 | 76960 | bool evenRow = (h & 1) == 0; | |
| 36 |
8/8✓ Branch 0 taken 9528 times.
✓ Branch 1 taken 9712 times.
✓ Branch 2 taken 9528 times.
✓ Branch 3 taken 9712 times.
✓ Branch 4 taken 9528 times.
✓ Branch 5 taken 9712 times.
✓ Branch 6 taken 9528 times.
✓ Branch 7 taken 9712 times.
|
76960 | if (evenRow) { |
| 37 | if constexpr (kInterleave) { | ||
| 38 | 19648 | u_row = uv_dst + uv_stride * (h / 2); | |
| 39 | } else { | ||
| 40 | 19200 | u_row = | |
| 41 | 19200 | uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * ((width + 1) / 2); | |
| 42 | // Pointer to the current row of the V plane. | ||
| 43 | // U and V are vertically subsampled 2:1 (one chroma row per two | ||
| 44 | // source rows) and are stored back to back in one chroma region. | ||
| 45 | // Chroma rows are packed two per uv_stride-sized line: chroma row c | ||
| 46 | // starts at line c / 2, at byte offset (c % 2) * ((width + 1) / 2) | ||
| 47 | // within that line. The U plane occupies the first (height + 1) / 2 | ||
| 48 | // chroma rows and the V plane follows it directly, so V does not | ||
| 49 | // necessarily start on a line boundary. For an even source row h | ||
| 50 | // the U row is c = h / 2 and the matching V row is | ||
| 51 | // c + (height + 1) / 2 == (h + height + 1) / 2; the line index and | ||
| 52 | // in-line offset below are derived from that value exactly as for U. | ||
| 53 | 38400 | v_row = uv_dst + uv_stride * ((h + height + 1) / 4) + | |
| 54 | 19200 | (((h + height + 1) / 2) % 2) * ((width + 1) / 2); | |
| 55 | } | ||
| 56 | 38848 | } | |
| 57 | |||
| 58 | 76960 | const size_t kVectorLength = svcntb(); | |
| 59 | 76960 | LoopUnroll2 loop{width, kVectorLength}; | |
| 60 | |||
| 61 | 93376 | loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
| 62 | 16416 | svbool_t pg = svptrue_b8(); | |
| 63 | |||
| 64 | 32832 | vector_path_2x(src_row, y_row, u_row, v_row, v_first, index, evenRow, | |
| 65 | 16416 | pg, pg, pg); | |
| 66 | 16416 | }); | |
| 67 | |||
| 68 | 153424 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
| 69 | 76464 | svbool_t pg = svwhilelt_b8_u64(index, length); | |
| 70 | 76464 | svbool_t pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) / 2); | |
| 71 |
8/8✓ Branch 0 taken 19332 times.
✓ Branch 1 taken 19116 times.
✓ Branch 2 taken 19332 times.
✓ Branch 3 taken 19116 times.
✓ Branch 4 taken 19332 times.
✓ Branch 5 taken 19116 times.
✓ Branch 6 taken 19332 times.
✓ Branch 7 taken 19116 times.
|
153792 | while (svptest_first(svptrue_b8(), pg)) { |
| 72 | 154656 | vector_path(src_row, y_row, u_row, v_row, v_first, index, evenRow, pg, | |
| 73 | 77328 | pg_half); | |
| 74 | 77328 | index += kVectorLength; | |
| 75 | 77328 | pg = svwhilelt_b8_u64(index, length); | |
| 76 | 77328 | pg_half = svwhilelt_b8_u64((index + 1) / 2, (length + 1) / 2); | |
| 77 | } | ||
| 78 | 76464 | }); | |
| 79 | 76960 | } | |
| 80 | 1504 | return KLEIDICV_OK; | |
| 81 | 1504 | } | |
| 82 | |||
| 83 | private: | ||
| 84 | 16416 | static void vector_path_2x(const uint8_t *src_row, uint8_t *y_row, | |
| 85 | uint8_t *u_row, uint8_t *v_row, bool v_first, | ||
| 86 | const size_t index, const bool evenRow, | ||
| 87 | const svbool_t pg0, const svbool_t pg1, | ||
| 88 | const svbool_t pg_half) KLEIDICV_STREAMING { | ||
| 89 | 16416 | svuint32_t r0_0, r0_1, r0_2, r0_3, g0_0, g0_1, g0_2, g0_3, b0_0, b0_1, b0_2, | |
| 90 | b0_3, r1_0, r1_1, r1_2, r1_3, g1_0, g1_1, g1_2, g1_3, b1_0, b1_1, b1_2, | ||
| 91 | b1_3; | ||
| 92 | |||
| 93 | 16416 | ArrayOfFour_svuint32 r0 = { | |
| 94 | 16416 | {std::ref(r0_0), std::ref(r0_1), std::ref(r0_2), std::ref(r0_3)}}; | |
| 95 | 16416 | ArrayOfFour_svuint32 g0 = { | |
| 96 | 16416 | {std::ref(g0_0), std::ref(g0_1), std::ref(g0_2), std::ref(g0_3)}}; | |
| 97 | 16416 | ArrayOfFour_svuint32 b0 = { | |
| 98 | 16416 | {std::ref(b0_0), std::ref(b0_1), std::ref(b0_2), std::ref(b0_3)}}; | |
| 99 | 16416 | ArrayOfFour_svuint32 r1 = { | |
| 100 | 16416 | {std::ref(r1_0), std::ref(r1_1), std::ref(r1_2), std::ref(r1_3)}}; | |
| 101 | 16416 | ArrayOfFour_svuint32 g1 = { | |
| 102 | 16416 | {std::ref(g1_0), std::ref(g1_1), std::ref(g1_2), std::ref(g1_3)}}; | |
| 103 | 16416 | ArrayOfFour_svuint32 b1 = { | |
| 104 | 16416 | {std::ref(b1_0), std::ref(b1_1), std::ref(b1_2), std::ref(b1_3)}}; | |
| 105 | |||
| 106 | 16416 | load_rgb_2x(r0, g0, b0, r1, g1, b1, src_row, scn * index, pg0, pg1); | |
| 107 | |||
| 108 | 16416 | svuint8_t y0 = rgb_to_y(r0, g0, b0); | |
| 109 | |||
| 110 | 16416 | svuint8_t y1 = rgb_to_y(r1, g1, b1); | |
| 111 | |||
| 112 | #if KLEIDICV_TARGET_SME2 | ||
| 113 | // Assumes pg0 and pg1 are all-true: both vectors are stored with a single all-true predicate-as-counter. | ||
| 114 | 1296 | svcount_t pg_counter = svptrue_c8(); | |
| 115 | 1296 | svst1(pg_counter, y_row + index, svcreate2(y0, y1)); | |
| 116 | #else | ||
| 117 | 15120 | svst1(pg0, y_row + index, y0); | |
| 118 | 15120 | svst1(pg1, y_row + index + svcntb(), y1); | |
| 119 | #endif // KLEIDICV_TARGET_SME2 | ||
| 120 | |||
| 121 |
8/8✓ Branch 0 taken 1976 times.
✓ Branch 1 taken 2128 times.
✓ Branch 2 taken 1976 times.
✓ Branch 3 taken 2128 times.
✓ Branch 4 taken 1976 times.
✓ Branch 5 taken 2128 times.
✓ Branch 6 taken 1976 times.
✓ Branch 7 taken 2128 times.
|
16416 | if (evenRow) { |
| 122 | 8512 | svuint8_t u, v; | |
| 123 | 8512 | svint32_t r_even_0 = svreinterpret_s32(r0_0); | |
| 124 | 8512 | svint32_t r_even_1 = svreinterpret_s32(r0_1); | |
| 125 | 8512 | svint32_t r_even_2 = svreinterpret_s32(r1_0); | |
| 126 | 8512 | svint32_t r_even_3 = svreinterpret_s32(r1_1); | |
| 127 | 8512 | svint32_t g_even_0 = svreinterpret_s32(g0_0); | |
| 128 | 8512 | svint32_t g_even_1 = svreinterpret_s32(g0_1); | |
| 129 | 8512 | svint32_t g_even_2 = svreinterpret_s32(g1_0); | |
| 130 | 8512 | svint32_t g_even_3 = svreinterpret_s32(g1_1); | |
| 131 | 8512 | svint32_t b_even_0 = svreinterpret_s32(b0_0); | |
| 132 | 8512 | svint32_t b_even_1 = svreinterpret_s32(b0_1); | |
| 133 | 8512 | svint32_t b_even_2 = svreinterpret_s32(b1_0); | |
| 134 | 8512 | svint32_t b_even_3 = svreinterpret_s32(b1_1); | |
| 135 | |||
| 136 | 17024 | ArrayOfFour_svint32 r_even = {{std::ref(r_even_0), std::ref(r_even_1), | |
| 137 | 17024 | std::ref(r_even_2), std::ref(r_even_3)}}; | |
| 138 | 17024 | ArrayOfFour_svint32 g_even = {{std::ref(g_even_0), std::ref(g_even_1), | |
| 139 | 17024 | std::ref(g_even_2), std::ref(g_even_3)}}; | |
| 140 | 17024 | ArrayOfFour_svint32 b_even = {{std::ref(b_even_0), std::ref(b_even_1), | |
| 141 | 17024 | std::ref(b_even_2), std::ref(b_even_3)}}; | |
| 142 | |||
| 143 | 8512 | rgb_to_uv_2x(r_even, g_even, b_even, u, v); | |
| 144 | |||
| 145 |
8/8✓ Branch 0 taken 1064 times.
✓ Branch 1 taken 1064 times.
✓ Branch 2 taken 1064 times.
✓ Branch 3 taken 1064 times.
✓ Branch 4 taken 1064 times.
✓ Branch 5 taken 1064 times.
✓ Branch 6 taken 1064 times.
✓ Branch 7 taken 1064 times.
|
8512 | if (v_first) { |
| 146 | 4256 | swap_scalable(u, v); | |
| 147 | 4256 | } | |
| 148 | |||
| 149 | if constexpr (kInterleave) { | ||
| 150 | 4256 | svuint8x2_t uv = svcreate2(u, v); | |
| 151 | 4256 | svst2_u8(pg_half, u_row + index, uv); | |
| 152 | 4256 | } else { | |
| 153 | 4256 | svst1(pg_half, u_row + index / 2, u); | |
| 154 | 4256 | svst1(pg_half, v_row + index / 2, v); | |
| 155 | } | ||
| 156 | 8512 | } | |
| 157 | 16416 | } | |
| 158 | |||
| 159 | 77328 | static void vector_path(const uint8_t *src_row, uint8_t *y_row, | |
| 160 | uint8_t *u_row, uint8_t *v_row, bool v_first, | ||
| 161 | const size_t index, const bool evenRow, | ||
| 162 | const svbool_t pg0, | ||
| 163 | const svbool_t pg_half) KLEIDICV_STREAMING { | ||
| 164 | 77328 | svuint32_t r0_0, r0_1, r0_2, r0_3, g0_0, g0_1, g0_2, g0_3, b0_0, b0_1, b0_2, | |
| 165 | b0_3; | ||
| 166 | |||
| 167 | 77328 | ArrayOfFour_svuint32 r0 = { | |
| 168 | 77328 | {std::ref(r0_0), std::ref(r0_1), std::ref(r0_2), std::ref(r0_3)}}; | |
| 169 | 77328 | ArrayOfFour_svuint32 g0 = { | |
| 170 | 77328 | {std::ref(g0_0), std::ref(g0_1), std::ref(g0_2), std::ref(g0_3)}}; | |
| 171 | 77328 | ArrayOfFour_svuint32 b0 = { | |
| 172 | 77328 | {std::ref(b0_0), std::ref(b0_1), std::ref(b0_2), std::ref(b0_3)}}; | |
| 173 | |||
| 174 | 77328 | load_rgb(r0, g0, b0, src_row, scn * index, pg0); | |
| 175 | |||
| 176 | 77328 | svuint8_t y0 = rgb_to_y(r0, g0, b0); | |
| 177 | |||
| 178 | 77328 | svst1(pg0, y_row + index, y0); | |
| 179 | |||
| 180 |
8/8✓ Branch 0 taken 9580 times.
✓ Branch 1 taken 9752 times.
✓ Branch 2 taken 9580 times.
✓ Branch 3 taken 9752 times.
✓ Branch 4 taken 9580 times.
✓ Branch 5 taken 9752 times.
✓ Branch 6 taken 9580 times.
✓ Branch 7 taken 9752 times.
|
77328 | if (evenRow) { |
| 181 | 39008 | svuint8_t u, v; | |
| 182 | 39008 | svint32_t r_even_0 = svreinterpret_s32(r0_0); | |
| 183 | 39008 | svint32_t r_even_1 = svreinterpret_s32(r0_1); | |
| 184 | 39008 | svint32_t g_even_0 = svreinterpret_s32(g0_0); | |
| 185 | 39008 | svint32_t g_even_1 = svreinterpret_s32(g0_1); | |
| 186 | 39008 | svint32_t b_even_0 = svreinterpret_s32(b0_0); | |
| 187 | 39008 | svint32_t b_even_1 = svreinterpret_s32(b0_1); | |
| 188 | |||
| 189 | 39008 | ArrayOfTwo_svint32 r_even = {{std::ref(r_even_0), std::ref(r_even_1)}}; | |
| 190 | 39008 | ArrayOfTwo_svint32 g_even = {{std::ref(g_even_0), std::ref(g_even_1)}}; | |
| 191 | 39008 | ArrayOfTwo_svint32 b_even = {{std::ref(b_even_0), std::ref(b_even_1)}}; | |
| 192 | |||
| 193 | 39008 | rgb_to_uv(r_even, g_even, b_even, u, v); | |
| 194 | |||
| 195 |
8/8✓ Branch 0 taken 9332 times.
✓ Branch 1 taken 420 times.
✓ Branch 2 taken 9332 times.
✓ Branch 3 taken 420 times.
✓ Branch 4 taken 9332 times.
✓ Branch 5 taken 420 times.
✓ Branch 6 taken 9332 times.
✓ Branch 7 taken 420 times.
|
39008 | if (v_first) { |
| 196 | 1680 | swap_scalable(u, v); | |
| 197 | 1680 | } | |
| 198 | |||
| 199 | if constexpr (kInterleave) { | ||
| 200 | 19728 | svuint8x2_t uv = svcreate2(u, v); | |
| 201 | 19728 | svst2_u8(pg_half, u_row + index, uv); | |
| 202 | 19728 | } else { | |
| 203 | 19280 | svst1(pg_half, u_row + index / 2, u); | |
| 204 | 19280 | svst1(pg_half, v_row + index / 2, v); | |
| 205 | } | ||
| 206 | 39008 | } | |
| 207 | 77328 | } | |
| 208 | |||
| 209 | 110160 | static svuint8_t rgb_to_y(ArrayOfFour_svuint32 r, ArrayOfFour_svuint32 g, | |
| 210 | ArrayOfFour_svuint32 b) KLEIDICV_STREAMING { | ||
| 211 | 110160 | const uint32_t kShifted16 = (16 << kWeightScale); | |
| 212 | 110160 | const uint32_t kHalfShift = (1 << (kWeightScale - 1)); | |
| 213 | |||
| 214 | 110160 | svbool_t pg = svptrue_b32(); | |
| 215 | |||
| 216 | // Y = kRY*R + kGY*G + kBY*B + bias, where bias = rounding term + (16 << kWeightScale) | ||
| 217 | 110160 | svuint32_t bias = svdup_u32(kHalfShift + kShifted16); | |
| 218 | 110160 | svuint32_t y_0 = bias; | |
| 219 | 110160 | svuint32_t y_1 = bias; | |
| 220 | 110160 | svuint32_t y_2 = bias; | |
| 221 | 110160 | svuint32_t y_3 = bias; | |
| 222 | |||
| 223 | 110160 | ArrayOfFour_svuint32 y = { | |
| 224 | 110160 | {std::ref(y_0), std::ref(y_1), std::ref(y_2), std::ref(y_3)}}; | |
| 225 | |||
| 226 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 227 |
8/8✓ Branch 0 taken 27540 times.
✓ Branch 1 taken 110160 times.
✓ Branch 2 taken 27540 times.
✓ Branch 3 taken 110160 times.
✓ Branch 4 taken 27540 times.
✓ Branch 5 taken 110160 times.
✓ Branch 6 taken 27540 times.
✓ Branch 7 taken 110160 times.
|
550800 | for (int i = 0; i < 4; i++) { |
| 228 | 440640 | y(i) = svmla_n_u32_x(pg, y(i), r(i), kRYWeight); | |
| 229 | 440640 | y(i) = svmla_n_u32_x(pg, y(i), g(i), kGYWeight); | |
| 230 | 440640 | y(i) = svmla_n_u32_x(pg, y(i), b(i), kBYWeight); | |
| 231 | 440640 | } | |
| 232 | |||
| 233 | 110160 | svuint16_t y_b = svshrnb_n_u32(y(0), kWeightScale - 8); | |
| 234 | 110160 | y_b = svshrnt_n_u32(y_b, y(2), kWeightScale - 8); // 0, 1, 2, 3, 4, 5, 6, 7 | |
| 235 | 110160 | svuint16_t y_t = svshrnb_n_u32(y(1), kWeightScale - 8); | |
| 236 | 110160 | y_t = svshrnt_n_u32(y_t, y(3), | |
| 237 | kWeightScale - 8); // 8, 9, 10, 11, 12, 13, 14, 15 | ||
| 238 | |||
| 239 | 220320 | return svuzp2_u8(svreinterpret_u8(y_b), svreinterpret_u8(y_t)); | |
| 240 | 110160 | } | |
| 241 | |||
| 242 | 17024 | static svuint8_t compute_u_or_v_2x(ArrayOfFour_svint32 r, | |
| 243 | ArrayOfFour_svint32 g, | ||
| 244 | ArrayOfFour_svint32 b, const int r_coeff, | ||
| 245 | const int g_coeff, | ||
| 246 | const int b_coeff) KLEIDICV_STREAMING { | ||
| 247 | 17024 | svbool_t pg = svptrue_b32(); | |
| 248 | 17024 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
| 249 | 17024 | const int kShifted128 = (128 << kWeightScale); | |
| 250 | 17024 | svint32_t bias = svdup_s32(kHalfShift + kShifted128); | |
| 251 | 17024 | svint32_t uv0 = bias; | |
| 252 | 17024 | svint32_t uv1 = bias; | |
| 253 | 17024 | svint32_t uv2 = bias; | |
| 254 | 17024 | svint32_t uv3 = bias; | |
| 255 | |||
| 256 | 17024 | ArrayOfFour_svint32 uv = { | |
| 257 | 17024 | {std::ref(uv0), std::ref(uv1), std::ref(uv2), std::ref(uv3)}}; | |
| 258 | |||
| 259 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 260 |
8/8✓ Branch 0 taken 4256 times.
✓ Branch 1 taken 17024 times.
✓ Branch 2 taken 4256 times.
✓ Branch 3 taken 17024 times.
✓ Branch 4 taken 4256 times.
✓ Branch 5 taken 17024 times.
✓ Branch 6 taken 4256 times.
✓ Branch 7 taken 17024 times.
|
85120 | for (int i = 0; i < 4; i++) { |
| 261 | 68096 | uv(i) = svmla_n_s32_x(pg, uv(i), r(i), r_coeff); | |
| 262 | 68096 | uv(i) = svmla_n_s32_x(pg, uv(i), g(i), g_coeff); | |
| 263 | 68096 | uv(i) = svmla_n_s32_x(pg, uv(i), b(i), b_coeff); | |
| 264 | 68096 | } | |
| 265 | |||
| 266 | 34048 | svint16_t uv_b = | |
| 267 | 17024 | svuzp2_s16(svreinterpret_s16(uv(0)), svreinterpret_s16(uv(1))); | |
| 268 | 34048 | svint16_t uv_t = | |
| 269 | 17024 | svuzp2_s16(svreinterpret_s16(uv(2)), svreinterpret_s16(uv(3))); | |
| 270 | |||
| 271 | 17024 | uv_b = svasr_n_s16_x(pg, uv_b, kWeightScale - 16); | |
| 272 | 17024 | uv_t = svasr_n_s16_x(pg, uv_t, kWeightScale - 16); | |
| 273 | |||
| 274 | 34048 | return svuzp1_u8(svreinterpret_u8(uv_b), svreinterpret_u8(uv_t)); | |
| 275 | 17024 | } | |
| 276 | |||
| 277 | 8512 | static void rgb_to_uv_2x(ArrayOfFour_svint32 r, ArrayOfFour_svint32 g, | |
| 278 | ArrayOfFour_svint32 b, svuint8_t &u, | ||
| 279 | svuint8_t &v) KLEIDICV_STREAMING { | ||
| 280 | // ---------------- U (Cb) Component ---------------- | ||
| 281 | // U = R * kRU + G * kGU + B * kBU + bias | ||
| 282 | 8512 | u = compute_u_or_v_2x(r, g, b, kRUWeight, kGUWeight, kBUWeight); | |
| 283 | |||
| 284 | // ---------------- V (Cr) Component ---------------- | ||
| 285 | // V = R * kBU + G * kGV + B * kBV + bias (kBUWeight is also passed as the R weight for V) | ||
| 286 | 8512 | v = compute_u_or_v_2x(r, g, b, kBUWeight, kGVWeight, kBVWeight); | |
| 287 | 8512 | } | |
| 288 | |||
| 289 | 78016 | static svuint8_t compute_u_or_v(ArrayOfTwo_svint32 r, ArrayOfTwo_svint32 g, | |
| 290 | ArrayOfTwo_svint32 b, const int r_coeff, | ||
| 291 | const int g_coeff, | ||
| 292 | const int b_coeff) KLEIDICV_STREAMING { | ||
| 293 | 78016 | svbool_t pg = svptrue_b32(); | |
| 294 | 78016 | const int kHalfShift = (1 << (kWeightScale - 1)); | |
| 295 | 78016 | const int kShifted128 = (128 << kWeightScale); | |
| 296 | |||
| 297 | 78016 | svint32_t bias = svdup_s32(kHalfShift + kShifted128); | |
| 298 | 78016 | svint32_t uv0 = bias; | |
| 299 | 78016 | svint32_t uv1 = bias; | |
| 300 | |||
| 301 | 78016 | ArrayOfTwo_svint32 uv = {{std::ref(uv0), std::ref(uv1)}}; | |
| 302 | |||
| 303 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 304 |
8/8✓ Branch 0 taken 19504 times.
✓ Branch 1 taken 39008 times.
✓ Branch 2 taken 19504 times.
✓ Branch 3 taken 39008 times.
✓ Branch 4 taken 19504 times.
✓ Branch 5 taken 39008 times.
✓ Branch 6 taken 19504 times.
✓ Branch 7 taken 39008 times.
|
234048 | for (int i = 0; i < 2; i++) { |
| 305 | 156032 | uv(i) = svmla_n_s32_x(pg, uv(i), r(i), r_coeff); | |
| 306 | 156032 | uv(i) = svmla_n_s32_x(pg, uv(i), g(i), g_coeff); | |
| 307 | 156032 | uv(i) = svmla_n_s32_x(pg, uv(i), b(i), b_coeff); | |
| 308 | 156032 | } | |
| 309 | |||
| 310 | 156032 | svint16_t output = | |
| 311 | 78016 | svuzp2_s16(svreinterpret_s16(uv(0)), svreinterpret_s16(uv(1))); | |
| 312 | |||
| 313 | 78016 | output = svasr_n_s16_x(pg, output, kWeightScale - 16); | |
| 314 | |||
| 315 | 156032 | return svuzp1_u8(svreinterpret_u8(output), svreinterpret_u8(output)); | |
| 316 | 78016 | } | |
| 317 | |||
| 318 | 39008 | static void rgb_to_uv(ArrayOfTwo_svint32 r, ArrayOfTwo_svint32 g, | |
| 319 | ArrayOfTwo_svint32 b, svuint8_t &u, | ||
| 320 | svuint8_t &v) KLEIDICV_STREAMING { | ||
| 321 | // ---------------- U (Cb) Component ---------------- | ||
| 322 | // U = R * kRU + G * kGU + B * kBU + bias | ||
| 323 | 39008 | u = compute_u_or_v(r, g, b, kRUWeight, kGUWeight, kBUWeight); | |
| 324 | |||
| 325 | // ---------------- V (Cr) Component ---------------- | ||
| 326 | // V = R * kBU + G * kGV + B * kBV + bias (kBUWeight is also passed as the R weight for V) | ||
| 327 | 39008 | v = compute_u_or_v(r, g, b, kBUWeight, kGVWeight, kBVWeight); | |
| 328 | 39008 | } | |
| 329 | |||
| 330 | 110160 | static void load_rgb(ArrayOfFour_svuint32 &r, ArrayOfFour_svuint32 &g, | |
| 331 | ArrayOfFour_svuint32 &b, const uint8_t *src_row, | ||
| 332 | const size_t w, const svbool_t &pg0) KLEIDICV_STREAMING { | ||
| 333 | 110160 | svuint8_t b0, g0, r0; | |
| 334 | if constexpr (kAlpha) { | ||
| 335 | // 4-channel input (RGBA or BGRA) | ||
| 336 | 55080 | svuint8x4_t vsrc0 = svld4(pg0, src_row + w); | |
| 337 | |||
| 338 | 55080 | b0 = svget4(vsrc0, b_index); | |
| 339 | 55080 | g0 = svget4(vsrc0, g_index); | |
| 340 | 55080 | r0 = svget4(vsrc0, r_index); | |
| 341 | |||
| 342 | 55080 | } else { | |
| 343 | // 3-channel input (RGB or BGR) | ||
| 344 | 55080 | svuint8x3_t vsrc0 = svld3(pg0, src_row + w); | |
| 345 | |||
| 346 | 55080 | b0 = svget3(vsrc0, b_index); | |
| 347 | 55080 | g0 = svget3(vsrc0, g_index); | |
| 348 | 55080 | r0 = svget3(vsrc0, r_index); | |
| 349 | 55080 | } | |
| 350 | 110160 | svuint16_t r0_lo = svmovlb(r0); | |
| 351 | 110160 | svuint16_t r0_hi = svmovlt(r0); | |
| 352 | 110160 | r(0) = svunpklo(r0_lo); // 0, 2, 4, 6 | |
| 353 | 110160 | r(1) = svunpkhi(r0_lo); // 8, 10, 12, 14 | |
| 354 | 110160 | r(2) = svunpklo(r0_hi); // 1, 3, 5, 7 | |
| 355 | 110160 | r(3) = svunpkhi(r0_hi); // 9, 11, 13, 15 | |
| 356 | |||
| 357 | 110160 | svuint16_t g0_lo = svmovlb(g0); | |
| 358 | 110160 | svuint16_t g0_hi = svmovlt(g0); | |
| 359 | 110160 | g(0) = svunpklo(g0_lo); | |
| 360 | 110160 | g(1) = svunpkhi(g0_lo); | |
| 361 | 110160 | g(2) = svunpklo(g0_hi); | |
| 362 | 110160 | g(3) = svunpkhi(g0_hi); | |
| 363 | |||
| 364 | 110160 | svuint16_t b0_lo = svmovlb(b0); | |
| 365 | 110160 | svuint16_t b0_hi = svmovlt(b0); | |
| 366 | 110160 | b(0) = svunpklo(b0_lo); | |
| 367 | 110160 | b(1) = svunpkhi(b0_lo); | |
| 368 | 110160 | b(2) = svunpklo(b0_hi); | |
| 369 | 110160 | b(3) = svunpkhi(b0_hi); | |
| 370 | 110160 | } | |
| 371 | |||
| 372 | 16416 | static void load_rgb_2x(ArrayOfFour_svuint32 &r0, ArrayOfFour_svuint32 &g0, | |
| 373 | ArrayOfFour_svuint32 &b0, ArrayOfFour_svuint32 &r1, | ||
| 374 | ArrayOfFour_svuint32 &g1, ArrayOfFour_svuint32 &b1, | ||
| 375 | const uint8_t *src_row, const size_t w, | ||
| 376 | const svbool_t pg0, | ||
| 377 | const svbool_t pg1) KLEIDICV_STREAMING { | ||
| 378 | 16416 | const size_t kVectorLength = svcntb(); | |
| 379 | 16416 | load_rgb(r0, g0, b0, src_row, w, pg0); | |
| 380 | |||
| 381 | 16416 | load_rgb(r1, g1, b1, src_row, w + scn * kVectorLength, pg1); | |
| 382 | 16416 | } | |
| 383 | |||
| 384 | static constexpr int b_index = RGB ? 2 : 0; | ||
| 385 | static constexpr int g_index = 1; | ||
| 386 | static constexpr int r_index = RGB ? 0 : 2; | ||
| 387 | static constexpr size_t scn = kAlpha ? 4 : 3; | ||
| 388 | }; | ||
| 389 | |||
| 390 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 391 | |||
| 392 | #endif // KLEIDICV_RGB_TO_YUV420_SC_H | ||
| 393 |