| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_GRAY_TO_RGB_SC_H | ||
| 6 | #define KLEIDICV_GRAY_TO_RGB_SC_H | ||
| 7 | |||
| 8 | #include "kleidicv/conversions/gray_to_rgb.h" | ||
| 9 | #include "kleidicv/sve2.h" | ||
| 10 | |||
| 11 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 12 | |||
| 13 | template <typename ScalarType> | ||
| 14 | class GrayToRGB final : | ||
| 15 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 16 | public UsesTailPath, | ||
| 17 | #endif | ||
| 18 | public UnrollTwice { | ||
| 19 | public: | ||
| 20 | using ContextType = Context; | ||
| 21 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
| 22 | using VectorType = typename VecTraits::VectorType; | ||
| 23 | using Vector3Type = typename VecTraits::Vector3Type; | ||
| 24 | |||
| 25 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 26 | void vector_path(ContextType ctx, VectorType src_vect, | ||
| 27 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 28 | auto pg = ctx.predicate(); | ||
| 29 | svuint8x3_t dst_vect = svcreate3(src_vect, src_vect, src_vect); | ||
| 30 | svst3(pg, dst, dst_vect); | ||
| 31 | } | ||
| 32 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 33 | 195 | explicit GrayToRGB(svuint8x3_t &indices) KLEIDICV_STREAMING | |
| 34 | 195 | : indices_{indices} { | |
| 35 | 195 | initialize_indices(); | |
| 36 | 195 | } | |
| 37 | |||
| 38 | 666 | void vector_path(ContextType, VectorType src_vect, | |
| 39 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 40 | 666 | Vector3Type dst_vect = common_vector_path(src_vect); | |
| 41 | #if KLEIDICV_TARGET_SME2 | ||
| 42 | 102 | two_plus_one_store(dst, dst_vect); | |
| 43 | #else | ||
| 44 | 564 | svbool_t pg = VecTraits::svptrue(); | |
| 45 | 564 | common_store(pg, pg, pg, dst, dst_vect); | |
| 46 | #endif | ||
| 47 | 666 | } | |
| 48 | |||
| 49 | 188 | void tail_path(ContextType ctx, VectorType src_vect, | |
| 50 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 51 | 188 | auto pg = ctx.predicate(); | |
| 52 | // Predicates for consecutive stores. | ||
| 53 | 188 | svbool_t pg_0, pg_1, pg_2; | |
| 54 | 188 | VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2); | |
| 55 | // Call the common vector path. | ||
| 56 | 188 | Vector3Type dst_vect = common_vector_path(src_vect); | |
| 57 | 188 | common_store(pg_0, pg_1, pg_2, dst, dst_vect); | |
| 58 | 188 | } | |
| 59 | |||
| 60 | private: | ||
| 61 | 854 | Vector3Type common_vector_path(VectorType src_vect) KLEIDICV_STREAMING { | |
| 62 | // Convert from gray to RGB using table-lookups. | ||
| 63 | 1708 | return svcreate3(svtbl(src_vect, svget3(indices_, 0)), | |
| 64 | 854 | svtbl(src_vect, svget3(indices_, 1)), | |
| 65 | 854 | svtbl(src_vect, svget3(indices_, 2))); | |
| 66 | } | ||
| 67 | |||
| 68 | #if KLEIDICV_TARGET_SME2 | ||
| 69 | 102 | void two_plus_one_store(ScalarType *dst, | |
| 70 | Vector3Type dst_vect) KLEIDICV_STREAMING { | ||
| 71 | 102 | svcount_t p_counter = VecTraits::svptrue_c(); | |
| 72 | 102 | svst1(p_counter, dst, svcreate2(svget3(dst_vect, 0), svget3(dst_vect, 1))); | |
| 73 | 102 | svst1_vnum(VecTraits::svptrue(), dst, 2, svget3(dst_vect, 2)); | |
| 74 | 102 | } | |
| 75 | #endif | ||
| 76 | |||
| 77 | 752 | void common_store(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2, | |
| 78 | ScalarType *dst, Vector3Type dst_vect) KLEIDICV_STREAMING { | ||
| 79 | 752 | svst1(pg_0, &dst[0], svget3(dst_vect, 0)); | |
| 80 | 752 | svst1_vnum(pg_1, &dst[0], 1, svget3(dst_vect, 1)); | |
| 81 | 752 | svst1_vnum(pg_2, &dst[0], 2, svget3(dst_vect, 2)); | |
| 82 | 752 | } | |
| 83 | |||
| 84 | 195 | void initialize_indices() KLEIDICV_STREAMING { | |
| 85 | // All-true predicate to shorten code. | ||
| 86 | 195 | svbool_t pg_all = VecTraits::svptrue(); | |
| 87 | // Constant used for division by 3. | ||
| 88 | 195 | VectorType const_171 = VecTraits::svdup(171); | |
| 89 | // Generated indices. | ||
| 90 | 195 | VectorType indices_0, indices_1, indices_2; | |
| 91 | |||
| 92 | 195 | indices_0 = svindex_u8(0, 1); | |
| 93 | |||
| 94 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 192 times.
|
195 | if (KLEIDICV_UNLIKELY(svcntb() == 256)) { |
| 95 | 3 | indices_1 = svext( | |
| 96 | 3 | svdup_u8(0), | |
| 97 | 3 | svqadd(svindex_u8(svcntb() % 3, 1), static_cast<uint8_t>(2)), 254); | |
| 98 | 6 | indices_2 = svext(svdup_u8(0), | |
| 99 | 3 | svqadd(svindex_u8(0, 1), static_cast<uint8_t>(3)), 255); | |
| 100 | 3 | } else { | |
| 101 | 192 | indices_1 = svindex_u8(svcntb() % 3, 1); | |
| 102 | 192 | indices_2 = svindex_u8((svcntb() * 2) % 3, 1); | |
| 103 | } | ||
| 104 | |||
| 105 | 195 | indices_0 = svlsr_x(pg_all, svmulh_x(pg_all, indices_0, const_171), 1); | |
| 106 | 195 | indices_1 = svqadd_x( | |
| 107 | 195 | pg_all, svlsr_x(pg_all, svmulh_x(pg_all, indices_1, const_171), 1), | |
| 108 | 195 | static_cast<ScalarType>(svcntb() / 3)); | |
| 109 | 195 | indices_2 = svqadd_x( | |
| 110 | 195 | pg_all, svlsr_x(pg_all, svmulh_x(pg_all, indices_2, const_171), 1), | |
| 111 | 195 | static_cast<ScalarType>((svcntb() * 2) / 3)); | |
| 112 | |||
| 113 | 195 | indices_ = svcreate3(indices_0, indices_1, indices_2); | |
| 114 | 195 | } | |
| 115 | |||
| 116 | svuint8x3_t &indices_; | ||
| 117 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 118 | }; // end of class GrayToRGB<ScalarType> | ||
| 119 | |||
| 120 | template <typename ScalarType> | ||
| 121 | class GrayToRGBAWithInterleaving final : public UnrollTwice { | ||
| 122 | public: | ||
| 123 | using ContextType = Context; | ||
| 124 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
| 125 | using VectorType = typename VecTraits::VectorType; | ||
| 126 | 24 | void vector_path(ContextType ctx, VectorType src_vect, | |
| 127 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 128 | 24 | auto pg = ctx.predicate(); | |
| 129 | 24 | svuint8_t alpha = svdup_u8(0xff); | |
| 130 | 24 | svuint8x4_t dst_vect = svcreate4(src_vect, src_vect, src_vect, alpha); | |
| 131 | |||
| 132 | 24 | svst4(pg, dst, dst_vect); | |
| 133 | 24 | } | |
| 134 | }; // end of class GrayToRGBAWithInterleaving<ScalarType> | ||
| 135 | |||
| 136 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 137 | template <typename ScalarType> | ||
| 138 | class GrayToRGBAWithLookUpTable final : public UnrollTwice, | ||
| 139 | public UsesTailPath { | ||
| 140 | public: | ||
| 141 | using ContextType = Context; | ||
| 142 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<ScalarType>; | ||
| 143 | using VectorType = typename VecTraits::VectorType; | ||
| 144 | using Vector4Type = typename VecTraits::Vector4Type; | ||
| 145 | 192 | explicit GrayToRGBAWithLookUpTable(svuint8x4_t &indices) KLEIDICV_STREAMING | |
| 146 | 192 | : indices_{indices} { | |
| 147 | 192 | initialize_indices(); | |
| 148 | 192 | } | |
| 149 | |||
| 150 | 646 | void vector_path(ContextType, VectorType src_vect, | |
| 151 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 152 | // Call the common vector path. | ||
| 153 | 646 | Vector4Type dst_vect = common_vector_path(src_vect); | |
| 154 | #if KLEIDICV_TARGET_SME2 | ||
| 155 | 102 | svcount_t p_counter = VecTraits::svptrue_c(); | |
| 156 | 102 | svst1(p_counter, &dst[0], dst_vect); | |
| 157 | #else | ||
| 158 | 544 | svbool_t pg = VecTraits::svptrue(); | |
| 159 | 544 | common_store(pg, pg, pg, pg, dst, dst_vect); | |
| 160 | #endif | ||
| 161 | 646 | } | |
| 162 | |||
| 163 | 184 | void tail_path(ContextType ctx, VectorType src_vect, | |
| 164 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 165 | 184 | auto pg = ctx.predicate(); | |
| 166 | // Predicates for consecutive stores. | ||
| 167 | 184 | svbool_t pg_0, pg_1, pg_2, pg_3; | |
| 168 | 184 | VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2, pg_3); | |
| 169 | // Call the common vector path. | ||
| 170 | 184 | Vector4Type dst_vect = common_vector_path(src_vect); | |
| 171 | 184 | common_store(pg_0, pg_1, pg_2, pg_3, dst, dst_vect); | |
| 172 | 184 | } | |
| 173 | |||
| 174 | private: | ||
| 175 | 830 | Vector4Type common_vector_path(VectorType src_vect) KLEIDICV_STREAMING { | |
| 176 | 830 | svuint8x2_t src_and_alpha = svcreate2(src_vect, VecTraits::svdup(-1)); | |
| 177 | // Convert from gray to RGBA using table-lookups. | ||
| 178 | 2490 | return svcreate4(svtbl2(src_and_alpha, svget4(indices_, 0)), | |
| 179 | 830 | svtbl2(src_and_alpha, svget4(indices_, 1)), | |
| 180 | 830 | svtbl2(src_and_alpha, svget4(indices_, 2)), | |
| 181 | 830 | svtbl2(src_and_alpha, svget4(indices_, 3))); | |
| 182 | 830 | } | |
| 183 | |||
| 184 | 728 | void common_store(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2, svbool_t pg_3, | |
| 185 | ScalarType *dst, Vector4Type dst_vect) KLEIDICV_STREAMING { | ||
| 186 | 728 | svst1(pg_0, &dst[0], svget4(dst_vect, 0)); | |
| 187 | 728 | svst1_vnum(pg_1, &dst[0], 1, svget4(dst_vect, 1)); | |
| 188 | 728 | svst1_vnum(pg_2, &dst[0], 2, svget4(dst_vect, 2)); | |
| 189 | 728 | svst1_vnum(pg_3, &dst[0], 3, svget4(dst_vect, 3)); | |
| 190 | 728 | } | |
| 191 | |||
| 192 | 192 | void initialize_indices() KLEIDICV_STREAMING { | |
| 193 | // Number of four-tuple elements. | ||
| 194 | 192 | uint64_t num_four_tuples = VecTraits::num_lanes() / 4; | |
| 195 | // Index of alpha. | ||
| 196 | 192 | uint64_t idx_alpha = VecTraits::num_lanes(); | |
| 197 | // Start index. | ||
| 198 | 192 | uint64_t start_index = idx_alpha << 24; | |
| 199 | |||
| 200 | // Index generation is similar to that of GrayToRGB above. | ||
| 201 | 384 | VectorType indices_0 = | |
| 202 | 192 | svreinterpret_u8_u32(svindex_u32(start_index, 0x10101)); | |
| 203 | |||
| 204 | // Repeat for 'indices_1' but add number of 4-tuple elements. | ||
| 205 | 192 | start_index += 0x10101 * num_four_tuples; | |
| 206 | 384 | VectorType indices_1 = | |
| 207 | 192 | svreinterpret_u8_u32(svindex_u32(start_index, 0x10101)); | |
| 208 | |||
| 209 | // Similarly to 'indices_1', but add twice the number of 4-tuple elements. | ||
| 210 | 192 | start_index += 0x10101 * num_four_tuples; | |
| 211 | 384 | VectorType indices_2 = | |
| 212 | 192 | svreinterpret_u8_u32(svindex_u32(start_index, 0x10101)); | |
| 213 | |||
| 214 | // Similarly to 'indices_1', but add three times the number of 4-tuple | ||
| 215 | // elements. | ||
| 216 | 192 | start_index += 0x10101 * num_four_tuples; | |
| 217 | 384 | VectorType indices_3 = | |
| 218 | 192 | svreinterpret_u8_u32(svindex_u32(start_index, 0x10101)); | |
| 219 | |||
| 220 | 192 | indices_ = svcreate4(indices_0, indices_1, indices_2, indices_3); | |
| 221 | 192 | } | |
| 222 | |||
| 223 | svuint8x4_t &indices_; | ||
| 224 | }; // end of class GrayToRGBAWithLookUpTable<ScalarType> | ||
| 225 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 226 | |||
| 227 | 243 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t gray_to_rgb_u8_sc( | |
| 228 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 229 | size_t width, size_t height) KLEIDICV_STREAMING { | ||
| 230 |
4/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 231 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 231 times.
|
243 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 231 |
4/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 219 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 219 times.
|
231 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 232 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 207 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 195 times.
|
219 | CHECK_IMAGE_SIZE(width, height); |
| 233 | |||
| 234 | 195 | Rectangle rect{width, height}; | |
| 235 | 195 | Rows<const uint8_t> src_rows{src, src_stride}; | |
| 236 | 195 | Rows<uint8_t> dst_rows{dst, dst_stride, 3 /* RGB */}; | |
| 237 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 238 | GrayToRGB<uint8_t> operation; | ||
| 239 | #else | ||
| 240 | 195 | svuint8x3_t table_indices; | |
| 241 | 195 | GrayToRGB<uint8_t> operation{table_indices}; | |
| 242 | #endif | ||
| 243 | 195 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 244 | 195 | return KLEIDICV_OK; | |
| 245 | 243 | } | |
| 246 | |||
| 247 | 243 | KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t gray_to_rgba_u8_sc( | |
| 248 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 249 | size_t width, size_t height) KLEIDICV_STREAMING { | ||
| 250 |
4/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 231 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 231 times.
|
243 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 251 |
4/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 219 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 219 times.
|
231 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 252 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 207 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 195 times.
|
219 | CHECK_IMAGE_SIZE(width, height); |
| 253 | |||
| 254 | 195 | Rectangle rect{width, height}; | |
| 255 | 195 | Rows<const uint8_t> src_rows{src, src_stride}; | |
| 256 | 195 | Rows<uint8_t> dst_rows{dst, dst_stride, 4 /* RGBA */}; | |
| 257 | |||
| 258 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 259 | GrayToRGBAWithInterleaving<uint8_t> operation{}; | ||
| 260 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | ||
| 261 | #else | ||
| 262 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 192 times.
|
195 | if (svcntb() > 128) { |
| 263 | 3 | GrayToRGBAWithInterleaving<uint8_t> operation{}; | |
| 264 | 3 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 265 | 3 | } else { | |
| 266 | 192 | svuint8x4_t table_indices; | |
| 267 | 192 | GrayToRGBAWithLookUpTable<uint8_t> operation{table_indices}; | |
| 268 | 192 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
| 269 | 192 | } | |
| 270 | #endif | ||
| 271 | 195 | return KLEIDICV_OK; | |
| 272 | 243 | } | |
| 273 | |||
| 274 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 275 | |||
| 276 | #endif // KLEIDICV_GRAY_TO_RGB_SC_H | ||
| 277 |