| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/conversions/merge.h" | ||
| 6 | #include "kleidicv/kleidicv.h" | ||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | |||
| 9 | namespace kleidicv::neon { | ||
| 10 | |||
| 11 | // ---------------------------------------- | ||
| 12 | // ------------ Two-way merge ------------- | ||
| 13 | // ---------------------------------------- | ||
| 14 | |||
| 15 | // Generic 2-way merge implementation. | ||
| 16 | // | ||
| 17 | // Algorithm description | ||
| 18 | // | ||
| 19 | // Elements are identified by their intended final position in the output. | ||
| 20 | // The description is for 32-bit elements, but it works just the same way | ||
| 21 | // for different element sizes. | ||
| 22 | // | ||
| 23 | // VECTOR / LANE: 0 1 2 3 | ||
| 24 | // src_a: [ 0, 2, 4, 6 ] | ||
| 25 | // src_b: [ 1, 3, 5, 7 ] | ||
| 26 | // | ||
| 27 | // zip1(a, b): [ 0, 1, 2, 3 ] -> d0 | ||
| 28 | // zip2(a, b): [ 4, 5, 6, 7 ] -> d1 | ||
| 29 | // | ||
| 30 | // Continuous store of { d0, d1 } gives the expected order. | ||
| 31 | template <typename ScalarType> | ||
| 32 | class Merge2 final : public UnrollTwice { | ||
| 33 | public: | ||
| 34 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 35 | using VectorType = typename VecTraits::VectorType; | ||
| 36 | using Vector2Type = typename VecTraits::Vector2Type; | ||
| 37 | |||
| 38 | 1600 | void vector_path(VectorType src_a, VectorType src_b, ScalarType *dst) { | |
| 39 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 40 | Vector2Type dst_vect; | ||
| 41 | dst_vect.val[0] = src_a; | ||
| 42 | dst_vect.val[1] = src_b; | ||
| 43 | vst2q(&dst[0], dst_vect); | ||
| 44 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 45 | 1600 | Vector2Type dst_vect; | |
| 46 | 1600 | dst_vect.val[0] = vzip1q(src_a, src_b); | |
| 47 | 1600 | dst_vect.val[1] = vzip2q(src_a, src_b); | |
| 48 | 1600 | VecTraits::store(dst_vect, &dst[0]); | |
| 49 | |||
| 50 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 51 | 1600 | } | |
| 52 | |||
| 53 | 256 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 54 | ScalarType *dst) { | ||
| 55 | 256 | dst[0] = src_a[0]; | |
| 56 | 256 | dst[1] = src_b[0]; | |
| 57 | 256 | } | |
| 58 | }; // end of class Merge2<ScalarType> | ||
| 59 | |||
| 60 | // ---------------------------------------- | ||
| 61 | // ---------- Three-way merge ------------- | ||
| 62 | // ---------------------------------------- | ||
| 63 | |||
| 64 | template <typename ScalarType> | ||
| 65 | class Merge3 final : public UnrollTwice { | ||
| 66 | public: | ||
| 67 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 68 | using VectorType = typename VecTraits::VectorType; | ||
| 69 | using Vector3Type = typename VecTraits::Vector3Type; | ||
| 70 | |||
| 71 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 72 | |||
| 73 | 136 | Merge3() : table_indices_{} { | |
| 74 | 136 | neon::VecTraits<uint8_t>::load(lookup_table(ScalarType()), table_indices_); | |
| 75 | 136 | } | |
| 76 | |||
| 77 | #endif | ||
| 78 | |||
| 79 | 1440 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 80 | ScalarType *dst) { | ||
| 81 | 1440 | Vector3Type dst_vect; | |
| 82 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 83 | dst_vect.val[0] = src_a; | ||
| 84 | dst_vect.val[1] = src_b; | ||
| 85 | dst_vect.val[2] = src_c; | ||
| 86 | vst3q(&dst[0], dst_vect); | ||
| 87 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 88 | 1440 | uint8x16x3_t src_vect; | |
| 89 | 1440 | src_vect.val[0] = vreinterpretq_u8(src_a); | |
| 90 | 1440 | src_vect.val[1] = vreinterpretq_u8(src_b); | |
| 91 | 1440 | src_vect.val[2] = vreinterpretq_u8(src_c); | |
| 92 | 1440 | dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]); | |
| 93 | 1440 | dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]); | |
| 94 | 1440 | dst_vect.val[2] = vqtbl3q_u8(src_vect, table_indices_.val[2]); | |
| 95 | 1440 | VecTraits::store(dst_vect, &dst[0]); | |
| 96 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 97 | 1440 | } | |
| 98 | |||
| 99 | 256 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 100 | const ScalarType *src_c, ScalarType *dst) { | ||
| 101 | 256 | dst[0] = src_a[0]; | |
| 102 | 256 | dst[1] = src_b[0]; | |
| 103 | 256 | dst[2] = src_c[0]; | |
| 104 | 256 | } | |
| 105 | |||
| 106 | private: | ||
| 107 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 108 | 68 | static uint8_t *lookup_table(uint8_t) { | |
| 109 | // clang-format off | ||
| 110 | static uint8_t kIndices[48] = { | ||
| 111 | 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5, | ||
| 112 | 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26, | ||
| 113 | 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, | ||
| 114 | }; | ||
| 115 | 68 | return &kIndices[0]; | |
| 116 | } | ||
| 117 | |||
| 118 | // Lookup table for 16-bit inputs. | ||
| 119 | 68 | static uint8_t *lookup_table(uint16_t) { | |
| 120 | // clang-format off | ||
| 121 | static uint8_t kIndices[48] = { | ||
| 122 | 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21, | ||
| 123 | 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11, | ||
| 124 | 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47, | ||
| 125 | }; | ||
| 126 | // clang-format on | ||
| 127 | 68 | return &kIndices[0]; | |
| 128 | } | ||
| 129 | |||
| 130 | uint8x16x3_t table_indices_; | ||
| 131 | #endif | ||
| 132 | }; // end of class Merge3<ScalarType> | ||
| 133 | |||
| 134 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 135 | |||
| 136 | // Specialized 3-way merge implementation for 32-bit elements. | ||
| 137 | // | ||
| 138 | // Algorithm description | ||
| 139 | // | ||
| 140 | // Elements are identified by their intended final position in the output. | ||
| 141 | // | ||
| 142 | // VECTOR / LANE: 0 1 2 3 | ||
| 143 | // src_a: [ 0, 3, 6, 9 ] | ||
| 144 | // src_b: [ 1, 4, 7, 10 ] | ||
| 145 | // src_c: [ 2, 5, 8, 11 ] | ||
| 146 | // | ||
| 147 | // trn2(a, b): [ 3, 4, 9, 10 ] -> w | ||
| 148 | // trn1(c, w): [ 2, 3, 8, 9 ] -> x | ||
| 149 | // trn2(w, c): [ 4, 5, 10, 11 ] -> y | ||
| 150 | // trn1(a, b): [ 0, 1, 6, 7 ] -> z | ||
| 151 | // | ||
| 152 | // zip1_u64(z, x): [ 0, 1, 2, 3 ] -> d0 | ||
| 153 | // [ y_u64[0], z_u64[1] ]: [ 4, 5, 6, 7 ] -> d1 | ||
| 154 | // zip2_u64(x, y): [ 8, 9, 10, 11 ] -> d2 | ||
| 155 | // | ||
| 156 | // Continuous store of { d0, d1, d2 } gives the expected order. | ||
| 157 | template <> | ||
| 158 | class Merge3<uint32_t> final : public UnrollTwice { | ||
| 159 | public: | ||
| 160 | using ScalarType = uint32_t; | ||
| 161 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 162 | using VectorType = typename VecTraits::VectorType; | ||
| 163 | using Vector3Type = typename VecTraits::Vector3Type; | ||
| 164 | |||
| 165 | 720 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 166 | ScalarType *dst) { | ||
| 167 | 720 | uint32x4_t w = vtrn2q_u32(src_a, src_b); | |
| 168 | 720 | uint32x4_t x = vtrn1q_u32(src_c, w); | |
| 169 | 720 | uint32x4_t y = vtrn2q_u32(w, src_c); | |
| 170 | 720 | uint32x4_t z = vtrn1q_u32(src_a, src_b); | |
| 171 | |||
| 172 | 720 | uint32x4_t dst_vect_0 = vzip1q_u64(z, x); | |
| 173 | 720 | uint64x2_t dst_vect_1 = y; | |
| 174 | 720 | dst_vect_1[1] = vreinterpretq_u64_u32(z)[1]; | |
| 175 | 720 | uint32x4_t dst_vect_2 = vzip2q_u64(x, y); | |
| 176 | |||
| 177 | // Not using vst1q_u32_x3, because the requirement on contiguous vector | ||
| 178 | // register allocation may result in longer code. | ||
| 179 | 720 | vst1q_u32(&dst[0 * VecTraits::num_lanes()], dst_vect_0); | |
| 180 | 720 | vst1q_u32(&dst[1 * VecTraits::num_lanes()], dst_vect_1); | |
| 181 | 720 | vst1q_u32(&dst[2 * VecTraits::num_lanes()], dst_vect_2); | |
| 182 | 720 | } | |
| 183 | |||
| 184 | 128 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 185 | const ScalarType *src_c, ScalarType *dst) { | ||
| 186 | 128 | dst[0] = src_a[0]; | |
| 187 | 128 | dst[1] = src_b[0]; | |
| 188 | 128 | dst[2] = src_c[0]; | |
| 189 | 128 | } | |
| 190 | }; // end of class Merge3<uint32_t> | ||
| 191 | |||
| 192 | // Specialized 3-way merge implementation for 64-bit elements. | ||
| 193 | // | ||
| 194 | // Algorithm description | ||
| 195 | // | ||
| 196 | // Elements are identified by their intended final position in the output. | ||
| 197 | // | ||
| 198 | // VECTOR / LANE: 0 1 | ||
| 199 | // src_a: [ 0, 3 ] | ||
| 200 | // src_b: [ 1, 4 ] | ||
| 201 | // src_c: [ 2, 5 ] | ||
| 202 | // | ||
| 203 | // zip1(a, b): [ 0, 1 ] -> d0 | ||
| 204 | // [ src_c[0], src_a[1] ]: [ 2, 3 ] -> d1 | ||
| 205 | // zip2(b, c): [ 4, 5 ] -> d2 | ||
| 206 | // | ||
| 207 | // Continuous store of { d0, d1, d2 } gives the expected order. | ||
| 208 | template <> | ||
| 209 | class Merge3<uint64_t> final : public UnrollTwice { | ||
| 210 | public: | ||
| 211 | using ScalarType = uint64_t; | ||
| 212 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 213 | using VectorType = typename VecTraits::VectorType; | ||
| 214 | |||
| 215 | 720 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 216 | ScalarType *dst) { | ||
| 217 | 720 | uint64x2x3_t dst_vect; | |
| 218 | 720 | dst_vect.val[0] = vzip1q_u64(src_a, src_b); | |
| 219 | 720 | dst_vect.val[1] = src_c; | |
| 220 | 720 | dst_vect.val[1][1] = src_a[1]; | |
| 221 | 720 | dst_vect.val[2] = vzip2q_u64(src_b, src_c); | |
| 222 | |||
| 223 | 720 | VecTraits::store(dst_vect, &dst[0]); | |
| 224 | 720 | } | |
| 225 | |||
| 226 | 128 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 227 | const ScalarType *src_c, ScalarType *dst) { | ||
| 228 | 128 | dst[0] = src_a[0]; | |
| 229 | 128 | dst[1] = src_b[0]; | |
| 230 | 128 | dst[2] = src_c[0]; | |
| 231 | 128 | } | |
| 232 | }; // end of class Merge3<uint64_t> | ||
| 233 | |||
| 234 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 235 | |||
| 236 | // ---------------------------------------- | ||
| 237 | // ----------- Four-way merge ------------- | ||
| 238 | // ---------------------------------------- | ||
| 239 | |||
| 240 | // Generic 4-way merge implementation. | ||
| 241 | // | ||
| 242 | // Algorithm description | ||
| 243 | // | ||
| 244 | // Elements are identified by their intended final position in the output. | ||
| 245 | // The description is for 32-bit elements, but it works just the same way | ||
| 246 | // for smaller element sizes. | ||
| 247 | // | ||
| 248 | // VECTOR / LANE: 0 1 2 3 | ||
| 249 | // src_a: [ 0, 4, 8, 12 ] | ||
| 250 | // src_b: [ 1, 5, 9, 13 ] | ||
| 251 | // src_c: [ 2, 6, 10, 14 ] | ||
| 252 | // src_d: [ 3, 7, 11, 15 ] | ||
| 253 | // | ||
| 254 | // zip1_u32(a, b): [ 0, 1, 4, 5 ] -> w | ||
| 255 | // zip1_u32(c, d): [ 2, 3, 6, 7 ] -> x | ||
| 256 | // zip2_u32(a, b): [ 8, 9, 12, 13 ] -> y | ||
| 257 | // zip2_u32(c, d): [ 10, 11, 14, 15 ] -> z | ||
| 258 | // | ||
| 259 | // zip1_u64(w, x): [ 0, 1, 2, 3 ] -> d0 | ||
| 260 | // zip2_u64(w, x): [ 4, 5, 6, 7 ] -> d1 | ||
| 261 | // zip1_u64(y, z): [ 8, 9, 10, 11 ] -> d2 | ||
| 262 | // zip2_u64(y, z): [ 12, 13, 14, 15 ] -> d3 | ||
| 263 | // | ||
| 264 | // Continuous store of { d0, d1, d2, d3 } gives the expected order. | ||
| 265 | template <typename ScalarType> | ||
| 266 | class Merge4 final : public UnrollTwice { | ||
| 267 | public: | ||
| 268 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 269 | using VectorType = typename VecTraits::VectorType; | ||
| 270 | using Vector4Type = typename VecTraits::Vector4Type; | ||
| 271 | |||
| 272 | 4080 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 273 | VectorType src_d, ScalarType *dst) { | ||
| 274 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 275 | Vector4Type dst_vect; | ||
| 276 | dst_vect.val[0] = src_a; | ||
| 277 | dst_vect.val[1] = src_b; | ||
| 278 | dst_vect.val[2] = src_c; | ||
| 279 | dst_vect.val[3] = src_d; | ||
| 280 | vst4q(&dst[0], dst_vect); | ||
| 281 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 282 | 4080 | auto zip1_a_b = double_width(vzip1q(src_a, src_b)); | |
| 283 | 4080 | auto zip1_c_d = double_width(vzip1q(src_c, src_d)); | |
| 284 | 4080 | auto zip2_a_b = double_width(vzip2q(src_a, src_b)); | |
| 285 | 4080 | auto zip2_c_d = double_width(vzip2q(src_c, src_d)); | |
| 286 | |||
| 287 | // Compilers tend to replace zip instructions with mov, resulting in | ||
| 288 | // longer generated code. Omitting a bitcast appears to help. | ||
| 289 | using DoubleScalarType = double_element_width_t<ScalarType>; | ||
| 290 | 4080 | typename neon::VecTraits<DoubleScalarType>::Vector4Type dst_vect; | |
| 291 | 4080 | dst_vect.val[0] = vzip1q(zip1_a_b, zip1_c_d); | |
| 292 | 4080 | dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d); | |
| 293 | 4080 | dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d); | |
| 294 | 4080 | dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d); | |
| 295 | 4080 | neon::VecTraits<DoubleScalarType>::store( | |
| 296 | 4080 | dst_vect, reinterpret_cast<DoubleScalarType *>(&dst[0])); | |
| 297 | |||
| 298 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 299 | 4080 | } | |
| 300 | |||
| 301 | 768 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 302 | const ScalarType *src_c, const ScalarType *src_d, | ||
| 303 | ScalarType *dst) { | ||
| 304 | 768 | dst[0] = src_a[0]; | |
| 305 | 768 | dst[1] = src_b[0]; | |
| 306 | 768 | dst[2] = src_c[0]; | |
| 307 | 768 | dst[3] = src_d[0]; | |
| 308 | 768 | } | |
| 309 | |||
| 310 | private: | ||
| 311 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 312 | // Polymorphic reinterpret_cast<>() between vector types where the element | ||
| 313 | // size is doubled. For example, if 'VectorType' is 'uint8x16_t', this | ||
| 314 | // method returns 'reinterpret_cast<uint16x8_t>(vector)'. | ||
| 315 | 16320 | static double_element_width_t<VectorType> double_width(VectorType vector) { | |
| 316 | 16320 | return reinterpret_cast<double_element_width_t<VectorType>>(vector); | |
| 317 | } | ||
| 318 | #endif | ||
| 319 | }; // end of class Merge4<ScalarType> | ||
| 320 | |||
| 321 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 322 | |||
| 323 | // Specialized 4-way merge implementation for 64-bit elements. | ||
| 324 | // | ||
| 325 | // Algorithm description | ||
| 326 | // | ||
| 327 | // Elements are identified by their intended final position in the output. | ||
| 328 | // | ||
| 329 | // VECTOR / LANE: 0 1 | ||
| 330 | // src_a: [ 0, 4 ] | ||
| 331 | // src_b: [ 1, 5 ] | ||
| 332 | // src_c: [ 2, 6 ] | ||
| 333 | // src_d: [ 3, 7 ] | ||
| 334 | // | ||
| 335 | // zip1(a, b): [ 0, 1 ] -> d0 | ||
| 336 | // zip1(c, d): [ 2, 3 ] -> d1 | ||
| 337 | // zip2(a, b): [ 4, 5 ] -> d2 | ||
| 338 | // zip2(c, d): [ 6, 7 ] -> d3 | ||
| 339 | // | ||
| 340 | // Continuous store of { d0, d1, d2, d3 } gives the expected order. | ||
| 341 | template <> | ||
| 342 | class Merge4<uint64_t> final : public UnrollTwice { | ||
| 343 | public: | ||
| 344 | using ScalarType = uint64_t; | ||
| 345 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 346 | using VectorType = typename VecTraits::VectorType; | ||
| 347 | using Vector4Type = typename VecTraits::Vector4Type; | ||
| 348 | |||
| 349 | 1360 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 350 | VectorType src_d, ScalarType *dst) { | ||
| 351 | 1360 | Vector4Type dst_vect; | |
| 352 | 1360 | dst_vect.val[0] = vzip1q(src_a, src_b); | |
| 353 | 1360 | dst_vect.val[1] = vzip1q(src_c, src_d); | |
| 354 | 1360 | dst_vect.val[2] = vzip2q(src_a, src_b); | |
| 355 | 1360 | dst_vect.val[3] = vzip2q(src_c, src_d); | |
| 356 | 1360 | VecTraits::store(dst_vect, &dst[0]); | |
| 357 | 1360 | } | |
| 358 | |||
| 359 | 256 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 360 | const ScalarType *src_c, const ScalarType *src_d, | ||
| 361 | ScalarType *dst) { | ||
| 362 | 256 | dst[0] = src_a[0]; | |
| 363 | 256 | dst[1] = src_b[0]; | |
| 364 | 256 | dst[2] = src_c[0]; | |
| 365 | 256 | dst[3] = src_d[0]; | |
| 366 | 256 | } | |
| 367 | }; // end of class Merge4<uint64_t> | ||
| 368 | |||
| 369 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 370 | |||
| 371 | // Most of the complexity comes from parameter checking. | ||
| 372 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
| 373 | template <typename ScalarType> | ||
| 374 | 1520 | kleidicv_error_t merge(const void **srcs, const size_t *src_strides, | |
| 375 | void *dst_void, size_t dst_stride, size_t width, | ||
| 376 | size_t height, size_t channels) { | ||
| 377 |
8/8✓ Branch 0 taken 4 times.
✓ Branch 1 taken 304 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 400 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 400 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 400 times.
|
1520 | if (channels < 2) { |
| 378 | 16 | return KLEIDICV_ERROR_RANGE; | |
| 379 | } | ||
| 380 |
8/8✓ Branch 0 taken 8 times.
✓ Branch 1 taken 296 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 392 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 392 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 392 times.
|
1504 | CHECK_POINTERS(srcs, src_strides); |
| 381 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 380 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 380 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 380 times.
|
1472 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src0, srcs[0]); |
| 382 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 368 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 368 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 368 times.
|
1436 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src1, srcs[1]); |
| 383 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 356 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 356 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 356 times.
|
1400 | MAKE_POINTER_CHECK_ALIGNMENT(ScalarType, dst, dst_void); |
| 384 |
16/16✓ Branch 0 taken 12 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 332 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 332 times.
✓ Branch 8 taken 24 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 332 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 332 times.
✓ Branch 14 taken 24 times.
✓ Branch 15 taken 332 times.
|
1364 | CHECK_POINTER_AND_STRIDE(src0, src_strides[0], height); |
| 385 |
16/16✓ Branch 0 taken 12 times.
✓ Branch 1 taken 272 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 308 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 308 times.
✓ Branch 8 taken 24 times.
✓ Branch 9 taken 308 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 308 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 308 times.
✓ Branch 14 taken 24 times.
✓ Branch 15 taken 308 times.
|
1280 | CHECK_POINTER_AND_STRIDE(src1, src_strides[1], height); |
| 386 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 268 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 268 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 292 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 292 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 292 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 292 times.
✓ Branch 12 taken 16 times.
✓ Branch 13 taken 292 times.
✓ Branch 14 taken 16 times.
✓ Branch 15 taken 292 times.
|
1196 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 387 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 260 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 260 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 288 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 284 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 284 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 284 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 284 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 288 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 284 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 284 times.
|
1144 | CHECK_IMAGE_SIZE(width, height); |
| 388 | |||
| 389 | 1112 | Rectangle rect{width, height}; | |
| 390 | 1112 | Rows<const ScalarType> src_a_rows{src0, src_strides[0]}; | |
| 391 | 1112 | Rows<const ScalarType> src_b_rows{src1, src_strides[1]}; | |
| 392 | 1112 | Rows<ScalarType> dst_rows{dst, dst_stride, channels}; | |
| 393 | |||
| 394 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 44 times.
✓ Branch 2 taken 72 times.
✓ Branch 3 taken 140 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 44 times.
✓ Branch 6 taken 80 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 44 times.
✓ Branch 10 taken 80 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 44 times.
✓ Branch 14 taken 80 times.
✓ Branch 15 taken 156 times.
|
1112 | switch (channels) { |
| 395 | case 2: { | ||
| 396 | 176 | Merge2<ScalarType> operation; | |
| 397 | 176 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
| 398 | dst_rows); | ||
| 399 | 176 | } break; | |
| 400 | |||
| 401 | case 3: { | ||
| 402 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
|
312 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]); |
| 403 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 68 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 68 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 68 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 68 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 68 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 68 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 68 times.
|
300 | CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height); |
| 404 | 272 | Merge3<ScalarType> operation; | |
| 405 | 272 | Rows<const ScalarType> src_c_rows{src2, src_strides[2]}; | |
| 406 | 272 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
| 407 | src_c_rows, dst_rows); | ||
| 408 |
8/8✓ Branch 0 taken 4 times.
✓ Branch 1 taken 68 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 68 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 68 times.
|
300 | } break; |
| 409 | |||
| 410 | case 4: { | ||
| 411 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 152 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 152 times.
|
608 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]); |
| 412 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 148 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 148 times.
|
596 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src3, srcs[3]); |
| 413 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 136 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 136 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 140 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 140 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 140 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 140 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 140 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 140 times.
|
584 | CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height); |
| 414 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 132 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 132 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 132 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 132 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 132 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 132 times.
|
556 | CHECK_POINTER_AND_STRIDE(src3, src_strides[3], height); |
| 415 | 528 | Merge4<ScalarType> operation; | |
| 416 | 528 | Rows<const ScalarType> src_c_rows{src2, src_strides[2]}; | |
| 417 | 528 | Rows<const ScalarType> src_d_rows{src3, src_strides[3]}; | |
| 418 | 528 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
| 419 | src_c_rows, src_d_rows, dst_rows); | ||
| 420 |
8/8✓ Branch 0 taken 8 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 132 times.
✓ Branch 4 taken 20 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 20 times.
✓ Branch 7 taken 132 times.
|
596 | } break; |
| 421 | |||
| 422 | default: | ||
| 423 | 16 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 424 | } | ||
| 425 | 976 | return KLEIDICV_OK; | |
| 426 | 1520 | } | |
| 427 | // NOLINTEND(readability-function-cognitive-complexity) | ||
| 428 | |||
| 429 | KLEIDICV_TARGET_FN_ATTRS | ||
| 430 | 1524 | kleidicv_error_t merge(const void **srcs, const size_t *src_strides, void *dst, | |
| 431 | size_t dst_stride, size_t width, size_t height, | ||
| 432 | size_t channels, size_t element_size) { | ||
| 433 |
5/5✓ Branch 0 taken 404 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 308 times.
✓ Branch 3 taken 404 times.
✓ Branch 4 taken 404 times.
|
1524 | switch (element_size) { |
| 434 | case sizeof(uint8_t): | ||
| 435 | 616 | return merge<uint8_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 436 | 308 | channels); | |
| 437 | |||
| 438 | case sizeof(uint16_t): | ||
| 439 | 808 | return merge<uint16_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 440 | 404 | channels); | |
| 441 | |||
| 442 | case sizeof(uint32_t): | ||
| 443 | 808 | return merge<uint32_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 444 | 404 | channels); | |
| 445 | |||
| 446 | case sizeof(uint64_t): | ||
| 447 | 808 | return merge<uint64_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 448 | 404 | channels); | |
| 449 | |||
| 450 | default: | ||
| 451 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 452 | } | ||
| 453 | 1524 | } | |
| 454 | |||
| 455 | } // namespace kleidicv::neon | ||
| 456 |