Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv/conversions/merge.h" | ||
6 | #include "kleidicv/kleidicv.h" | ||
7 | #include "kleidicv/neon.h" | ||
8 | |||
9 | namespace kleidicv::neon { | ||
10 | |||
11 | // ---------------------------------------- | ||
12 | // ------------ Two-way merge ------------- | ||
13 | // ---------------------------------------- | ||
14 | |||
15 | // Generic 2-way merge implementation. | ||
16 | // | ||
17 | // Algorithm description | ||
18 | // | ||
19 | // Elements are identified by their intended final position in the output. | ||
20 | // The description is for 32-bit elements, but it works just the same way | ||
21 | // for different element sizes. | ||
22 | // | ||
23 | // VECTOR / LANE: 0 1 2 3 | ||
24 | // src_a: [ 0, 2, 4, 6 ] | ||
25 | // src_b: [ 1, 3, 5, 7 ] | ||
26 | // | ||
27 | // zip1(a, b): [ 0, 1, 2, 3 ] -> d0 | ||
28 | // zip2(a, b): [ 4, 5, 6, 7 ] -> d1 | ||
29 | // | ||
30 | // Continuous store of { d0, d1 } gives the expected order. | ||
31 | template <typename ScalarType> | ||
32 | class Merge2 final : public UnrollTwice { | ||
33 | public: | ||
34 | using VecTraits = neon::VecTraits<ScalarType>; | ||
35 | using VectorType = typename VecTraits::VectorType; | ||
36 | using Vector2Type = typename VecTraits::Vector2Type; | ||
37 | |||
38 | 960 | void vector_path(VectorType src_a, VectorType src_b, ScalarType *dst) { | |
39 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
40 | Vector2Type dst_vect; | ||
41 | dst_vect.val[0] = src_a; | ||
42 | dst_vect.val[1] = src_b; | ||
43 | vst2q(&dst[0], dst_vect); | ||
44 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
45 | 960 | Vector2Type dst_vect; | |
46 | 960 | dst_vect.val[0] = vzip1q(src_a, src_b); | |
47 | 960 | dst_vect.val[1] = vzip2q(src_a, src_b); | |
48 | 960 | VecTraits::store(dst_vect, &dst[0]); | |
49 | |||
50 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
51 | 960 | } | |
52 | |||
53 | 192 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
54 | ScalarType *dst) { | ||
55 | 192 | dst[0] = src_a[0]; | |
56 | 192 | dst[1] = src_b[0]; | |
57 | 192 | } | |
58 | }; // end of class Merge2<ScalarType> | ||
59 | |||
60 | // ---------------------------------------- | ||
61 | // ---------- Three-way merge ------------- | ||
62 | // ---------------------------------------- | ||
63 | |||
64 | template <typename ScalarType> | ||
65 | class Merge3 final : public UnrollTwice { | ||
66 | public: | ||
67 | using VecTraits = neon::VecTraits<ScalarType>; | ||
68 | using VectorType = typename VecTraits::VectorType; | ||
69 | using Vector3Type = typename VecTraits::Vector3Type; | ||
70 | |||
71 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
72 | |||
73 | 102 | Merge3() : table_indices_{} { | |
74 | 102 | neon::VecTraits<uint8_t>::load(lookup_table(ScalarType()), table_indices_); | |
75 | 102 | } | |
76 | |||
77 | #endif | ||
78 | |||
79 | 864 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
80 | ScalarType *dst) { | ||
81 | 864 | Vector3Type dst_vect; | |
82 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
83 | dst_vect.val[0] = src_a; | ||
84 | dst_vect.val[1] = src_b; | ||
85 | dst_vect.val[2] = src_c; | ||
86 | vst3q(&dst[0], dst_vect); | ||
87 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
88 | 864 | uint8x16x3_t src_vect; | |
89 | 864 | src_vect.val[0] = vreinterpretq_u8(src_a); | |
90 | 864 | src_vect.val[1] = vreinterpretq_u8(src_b); | |
91 | 864 | src_vect.val[2] = vreinterpretq_u8(src_c); | |
92 | 864 | dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]); | |
93 | 864 | dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]); | |
94 | 864 | dst_vect.val[2] = vqtbl3q_u8(src_vect, table_indices_.val[2]); | |
95 | 864 | VecTraits::store(dst_vect, &dst[0]); | |
96 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
97 | 864 | } | |
98 | |||
99 | 192 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
100 | const ScalarType *src_c, ScalarType *dst) { | ||
101 | 192 | dst[0] = src_a[0]; | |
102 | 192 | dst[1] = src_b[0]; | |
103 | 192 | dst[2] = src_c[0]; | |
104 | 192 | } | |
105 | |||
106 | private: | ||
107 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
108 | 51 | static uint8_t *lookup_table(uint8_t) { | |
109 | // clang-format off | ||
110 | static uint8_t kIndices[48] = { | ||
111 | 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5, | ||
112 | 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26, | ||
113 | 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, | ||
114 | }; | ||
115 | 51 | return &kIndices[0]; | |
116 | } | ||
117 | |||
118 | // Lookup table for 16-bit inputs. | ||
119 | 51 | static uint8_t *lookup_table(uint16_t) { | |
120 | // clang-format off | ||
121 | static uint8_t kIndices[48] = { | ||
122 | 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21, | ||
123 | 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11, | ||
124 | 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47, | ||
125 | }; | ||
126 | // clang-format on | ||
127 | 51 | return &kIndices[0]; | |
128 | } | ||
129 | |||
130 | uint8x16x3_t table_indices_; | ||
131 | #endif | ||
132 | }; // end of class Merge3<ScalarType> | ||
133 | |||
134 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
135 | |||
136 | // Specialized 3-way merge implementation for 32-bit elements. | ||
137 | // | ||
138 | // Algorithm description | ||
139 | // | ||
140 | // Elements are identified by their intended final position in the output. | ||
141 | // | ||
142 | // VECTOR / LANE: 0 1 2 3 | ||
143 | // src_a: [ 0, 3, 6, 9 ] | ||
144 | // src_b: [ 1, 4, 7, 10 ] | ||
145 | // src_c: [ 2, 5, 8, 11 ] | ||
146 | // | ||
147 | // trn2(a, b): [ 3, 4, 9, 10 ] -> w | ||
148 | // trn1(c, w): [ 2, 3, 8, 9 ] -> x | ||
149 | // trn2(w, c): [ 4, 5, 10, 11 ] -> y | ||
150 | // trn1(a, b): [ 0, 1, 6, 7 ] -> z | ||
151 | // | ||
152 | // zip1_u64(z, x): [ 0, 1, 2, 3 ] -> d0 | ||
153 | // [ y_u64[0], z_u64[1] ]: [ 4, 5, 6, 7 ] -> d1 | ||
154 | // zip2_u64(x, y): [ 8, 9, 10, 11 ] -> d2 | ||
155 | // | ||
156 | // Continuous store of { d0, d1, d2 } gives the expected order. | ||
157 | template <> | ||
158 | class Merge3<uint32_t> final : public UnrollTwice { | ||
159 | public: | ||
160 | using ScalarType = uint32_t; | ||
161 | using VecTraits = neon::VecTraits<ScalarType>; | ||
162 | using VectorType = typename VecTraits::VectorType; | ||
163 | using Vector3Type = typename VecTraits::Vector3Type; | ||
164 | |||
165 | 432 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
166 | ScalarType *dst) { | ||
167 | 432 | uint32x4_t w = vtrn2q_u32(src_a, src_b); | |
168 | 432 | uint32x4_t x = vtrn1q_u32(src_c, w); | |
169 | 432 | uint32x4_t y = vtrn2q_u32(w, src_c); | |
170 | 432 | uint32x4_t z = vtrn1q_u32(src_a, src_b); | |
171 | |||
172 | 432 | uint32x4_t dst_vect_0 = vzip1q_u64(z, x); | |
173 | 432 | uint64x2_t dst_vect_1 = y; | |
174 | 432 | dst_vect_1[1] = vreinterpretq_u64_u32(z)[1]; | |
175 | 432 | uint32x4_t dst_vect_2 = vzip2q_u64(x, y); | |
176 | |||
177 | // Not using vst1q_u32_x3, because the requirement on continuous vector | ||
178 | // register allocation may result in longer code. | ||
179 | 432 | vst1q_u32(&dst[0 * VecTraits::num_lanes()], dst_vect_0); | |
180 | 432 | vst1q_u32(&dst[1 * VecTraits::num_lanes()], dst_vect_1); | |
181 | 432 | vst1q_u32(&dst[2 * VecTraits::num_lanes()], dst_vect_2); | |
182 | 432 | } | |
183 | |||
184 | 96 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
185 | const ScalarType *src_c, ScalarType *dst) { | ||
186 | 96 | dst[0] = src_a[0]; | |
187 | 96 | dst[1] = src_b[0]; | |
188 | 96 | dst[2] = src_c[0]; | |
189 | 96 | } | |
190 | }; // end of class Merge3<uint32_t> | ||
191 | |||
192 | // Specialized 3-way merge implementation for 64-bit elements. | ||
193 | // | ||
194 | // Algorithm description | ||
195 | // | ||
196 | // Elements are identified by their intended final position in the output. | ||
197 | // | ||
198 | // VECTOR / LANE: 0 1 | ||
199 | // src_a: [ 0, 3 ] | ||
200 | // src_b: [ 1, 4 ] | ||
201 | // src_c: [ 2, 5 ] | ||
202 | // | ||
203 | // zip1(a, b): [ 0, 1 ] -> d0 | ||
204 | // [ src_c[0], src_a[1] ]: [ 2, 3 ] -> d1 | ||
205 | // zip2(b, c): [ 4, 5 ] -> d2 | ||
206 | // | ||
207 | // Continuous store of { d0, d1, d2 } gives the expected order. | ||
208 | template <> | ||
209 | class Merge3<uint64_t> final : public UnrollTwice { | ||
210 | public: | ||
211 | using ScalarType = uint64_t; | ||
212 | using VecTraits = neon::VecTraits<ScalarType>; | ||
213 | using VectorType = typename VecTraits::VectorType; | ||
214 | |||
215 | 432 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
216 | ScalarType *dst) { | ||
217 | 432 | uint64x2x3_t dst_vect; | |
218 | 432 | dst_vect.val[0] = vzip1q_u64(src_a, src_b); | |
219 | 432 | dst_vect.val[1] = src_c; | |
220 | 432 | dst_vect.val[1][1] = src_a[1]; | |
221 | 432 | dst_vect.val[2] = vzip2q_u64(src_b, src_c); | |
222 | |||
223 | 432 | VecTraits::store(dst_vect, &dst[0]); | |
224 | 432 | } | |
225 | |||
226 | 96 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
227 | const ScalarType *src_c, ScalarType *dst) { | ||
228 | 96 | dst[0] = src_a[0]; | |
229 | 96 | dst[1] = src_b[0]; | |
230 | 96 | dst[2] = src_c[0]; | |
231 | 96 | } | |
232 | }; // end of class Merge3<uint64_t> | ||
233 | |||
234 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
235 | |||
236 | // ---------------------------------------- | ||
237 | // ----------- Four-way merge ------------- | ||
238 | // ---------------------------------------- | ||
239 | |||
240 | // Generic 4-way merge implementation. | ||
241 | // | ||
242 | // Algorithm description | ||
243 | // | ||
244 | // Elements are identified by their intended final position in the output. | ||
245 | // The description is for 32-bit elements, but it works just the same way | ||
246 | // for smaller element sizes. | ||
247 | // | ||
248 | // VECTOR / LANE: 0 1 2 3 | ||
249 | // src_a: [ 0, 4, 8, 12 ] | ||
250 | // src_b: [ 1, 5, 9, 13 ] | ||
251 | // src_c: [ 2, 6, 10, 14 ] | ||
252 | // src_d: [ 3, 7, 11, 15 ] | ||
253 | // | ||
254 | // zip1_u32(a, b): [ 0, 1, 4, 5 ] -> w | ||
255 | // zip1_u32(c, d): [ 2, 3, 6, 7 ] -> x | ||
256 | // zip2_u32(a, b): [ 8, 9, 12, 13 ] -> y | ||
257 | // zip2_u32(c, d): [ 10, 11, 14, 15 ] -> z | ||
258 | // | ||
259 | // zip1_u64(w, x): [ 0, 1, 2, 3 ] -> d0 | ||
260 | // zip2_u64(w, x): [ 4, 5, 6, 7 ] -> d1 | ||
261 | // zip1_u64(y, z): [ 8, 9, 10, 11 ] -> d2 | ||
262 | // zip2_u64(y, z): [ 12, 13, 14, 15 ] -> d3 | ||
263 | // | ||
264 | // Continuous store of { d0, d1, d2, d3 } gives the expected order. | ||
265 | template <typename ScalarType> | ||
266 | class Merge4 final : public UnrollTwice { | ||
267 | public: | ||
268 | using VecTraits = neon::VecTraits<ScalarType>; | ||
269 | using VectorType = typename VecTraits::VectorType; | ||
270 | using Vector4Type = typename VecTraits::Vector4Type; | ||
271 | |||
272 | 2448 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
273 | VectorType src_d, ScalarType *dst) { | ||
274 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
275 | Vector4Type dst_vect; | ||
276 | dst_vect.val[0] = src_a; | ||
277 | dst_vect.val[1] = src_b; | ||
278 | dst_vect.val[2] = src_c; | ||
279 | dst_vect.val[3] = src_d; | ||
280 | vst4q(&dst[0], dst_vect); | ||
281 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
282 | 2448 | auto zip1_a_b = double_width(vzip1q(src_a, src_b)); | |
283 | 2448 | auto zip1_c_d = double_width(vzip1q(src_c, src_d)); | |
284 | 2448 | auto zip2_a_b = double_width(vzip2q(src_a, src_b)); | |
285 | 2448 | auto zip2_c_d = double_width(vzip2q(src_c, src_d)); | |
286 | |||
287 | // Compilers tend to replace zip instructions with mov, resulting in | ||
288 | // longer generated code. Omitting a bitcast appears to help. | ||
289 | using DoubleScalarType = double_element_width_t<ScalarType>; | ||
290 | 2448 | typename neon::VecTraits<DoubleScalarType>::Vector4Type dst_vect; | |
291 | 2448 | dst_vect.val[0] = vzip1q(zip1_a_b, zip1_c_d); | |
292 | 2448 | dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d); | |
293 | 2448 | dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d); | |
294 | 2448 | dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d); | |
295 | 2448 | neon::VecTraits<DoubleScalarType>::store( | |
296 | 2448 | dst_vect, reinterpret_cast<DoubleScalarType *>(&dst[0])); | |
297 | |||
298 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
299 | 2448 | } | |
300 | |||
301 | 576 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
302 | const ScalarType *src_c, const ScalarType *src_d, | ||
303 | ScalarType *dst) { | ||
304 | 576 | dst[0] = src_a[0]; | |
305 | 576 | dst[1] = src_b[0]; | |
306 | 576 | dst[2] = src_c[0]; | |
307 | 576 | dst[3] = src_d[0]; | |
308 | 576 | } | |
309 | |||
310 | private: | ||
311 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
312 | // Polymorphic reinterpret_cast<>() between vector types where the element | ||
313 | // size is doubled. For example, if 'VectorType' is 'uint8x16_t', this | ||
314 | // method returns 'reinterpret_cast<uint16x8_t>(vector)'. | ||
315 | 9792 | static double_element_width_t<VectorType> double_width(VectorType vector) { | |
316 | 9792 | return reinterpret_cast<double_element_width_t<VectorType>>(vector); | |
317 | } | ||
318 | #endif | ||
319 | }; // end of class Merge4<ScalarType> | ||
320 | |||
321 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
322 | |||
323 | // Specialized 4-way merge implementation for 64-bit elements. | ||
324 | // | ||
325 | // Algorithm description | ||
326 | // | ||
327 | // Elements are identified by their intended final position in the output. | ||
328 | // | ||
329 | // VECTOR / LANE: 0 1 | ||
330 | // src_a: [ 0, 4 ] | ||
331 | // src_b: [ 1, 5 ] | ||
332 | // src_c: [ 2, 6 ] | ||
333 | // src_d: [ 3, 7 ] | ||
334 | // | ||
335 | // zip1(a, b): [ 0, 1 ] -> d0 | ||
336 | // zip1(c, d): [ 2, 3 ] -> d1 | ||
337 | // zip2(a, b): [ 4, 5 ] -> d2 | ||
338 | // zip2(c, d): [ 6, 7 ] -> d3 | ||
339 | // | ||
340 | // Continuous store of { d0, d1, d2, d3 } gives the expected order. | ||
341 | template <> | ||
342 | class Merge4<uint64_t> final : public UnrollTwice { | ||
343 | public: | ||
344 | using ScalarType = uint64_t; | ||
345 | using VecTraits = neon::VecTraits<ScalarType>; | ||
346 | using VectorType = typename VecTraits::VectorType; | ||
347 | using Vector4Type = typename VecTraits::Vector4Type; | ||
348 | |||
349 | 816 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
350 | VectorType src_d, ScalarType *dst) { | ||
351 | 816 | Vector4Type dst_vect; | |
352 | 816 | dst_vect.val[0] = vzip1q(src_a, src_b); | |
353 | 816 | dst_vect.val[1] = vzip1q(src_c, src_d); | |
354 | 816 | dst_vect.val[2] = vzip2q(src_a, src_b); | |
355 | 816 | dst_vect.val[3] = vzip2q(src_c, src_d); | |
356 | 816 | VecTraits::store(dst_vect, &dst[0]); | |
357 | 816 | } | |
358 | |||
359 | 192 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
360 | const ScalarType *src_c, const ScalarType *src_d, | ||
361 | ScalarType *dst) { | ||
362 | 192 | dst[0] = src_a[0]; | |
363 | 192 | dst[1] = src_b[0]; | |
364 | 192 | dst[2] = src_c[0]; | |
365 | 192 | dst[3] = src_d[0]; | |
366 | 192 | } | |
367 | }; // end of class Merge4<uint64_t> | ||
368 | |||
369 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
370 | |||
371 | // Most of the complexity comes from parameter checking. | ||
372 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
373 | template <typename ScalarType> | ||
374 | 1140 | kleidicv_error_t merge(const void **srcs, const size_t *src_strides, | |
375 | void *dst_void, size_t dst_stride, size_t width, | ||
376 | size_t height, size_t channels) { | ||
377 |
8/8✓ Branch 0 taken 3 times.
✓ Branch 1 taken 228 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 300 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 300 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 300 times.
|
1140 | if (channels < 2) { |
378 | 12 | return KLEIDICV_ERROR_RANGE; | |
379 | } | ||
380 |
8/8✓ Branch 0 taken 6 times.
✓ Branch 1 taken 222 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 294 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 294 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 294 times.
|
1128 | CHECK_POINTERS(srcs, src_strides); |
381 |
6/6✓ Branch 0 taken 9 times.
✓ Branch 1 taken 285 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 285 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 285 times.
|
1104 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src0, srcs[0]); |
382 |
6/6✓ Branch 0 taken 9 times.
✓ Branch 1 taken 276 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 276 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 276 times.
|
1077 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src1, srcs[1]); |
383 |
6/6✓ Branch 0 taken 9 times.
✓ Branch 1 taken 267 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 267 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 267 times.
|
1050 | MAKE_POINTER_CHECK_ALIGNMENT(ScalarType, dst, dst_void); |
384 |
16/16✓ Branch 0 taken 9 times.
✓ Branch 1 taken 213 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 213 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 249 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 249 times.
✓ Branch 8 taken 18 times.
✓ Branch 9 taken 249 times.
✓ Branch 10 taken 18 times.
✓ Branch 11 taken 249 times.
✓ Branch 12 taken 18 times.
✓ Branch 13 taken 249 times.
✓ Branch 14 taken 18 times.
✓ Branch 15 taken 249 times.
|
1023 | CHECK_POINTER_AND_STRIDE(src0, src_strides[0], height); |
385 |
16/16✓ Branch 0 taken 9 times.
✓ Branch 1 taken 204 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 204 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 231 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 231 times.
✓ Branch 8 taken 18 times.
✓ Branch 9 taken 231 times.
✓ Branch 10 taken 18 times.
✓ Branch 11 taken 231 times.
✓ Branch 12 taken 18 times.
✓ Branch 13 taken 231 times.
✓ Branch 14 taken 18 times.
✓ Branch 15 taken 231 times.
|
960 | CHECK_POINTER_AND_STRIDE(src1, src_strides[1], height); |
386 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 201 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 201 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 219 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 219 times.
✓ Branch 8 taken 12 times.
✓ Branch 9 taken 219 times.
✓ Branch 10 taken 12 times.
✓ Branch 11 taken 219 times.
✓ Branch 12 taken 12 times.
✓ Branch 13 taken 219 times.
✓ Branch 14 taken 12 times.
✓ Branch 15 taken 219 times.
|
897 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
387 |
24/24✓ Branch 0 taken 3 times.
✓ Branch 1 taken 198 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 195 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 216 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 213 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 213 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 216 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 213 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 213 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 216 times.
✓ Branch 20 taken 3 times.
✓ Branch 21 taken 213 times.
✓ Branch 22 taken 6 times.
✓ Branch 23 taken 213 times.
|
858 | CHECK_IMAGE_SIZE(width, height); |
388 | |||
389 | 834 | Rectangle rect{width, height}; | |
390 | 834 | Rows<const ScalarType> src_a_rows{src0, src_strides[0]}; | |
391 | 834 | Rows<const ScalarType> src_b_rows{src1, src_strides[1]}; | |
392 | 834 | Rows<ScalarType> dst_rows{dst, dst_stride, channels}; | |
393 | |||
394 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 33 times.
✓ Branch 2 taken 54 times.
✓ Branch 3 taken 105 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 33 times.
✓ Branch 6 taken 60 times.
✓ Branch 7 taken 117 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 33 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 117 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 33 times.
✓ Branch 14 taken 60 times.
✓ Branch 15 taken 117 times.
|
834 | switch (channels) { |
395 | case 2: { | ||
396 | 132 | Merge2<ScalarType> operation; | |
397 | 132 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
398 | dst_rows); | ||
399 | 132 | } break; | |
400 | |||
401 | case 3: { | ||
402 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 57 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 57 times.
|
234 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]); |
403 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 51 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 51 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 51 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 51 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 51 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 51 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 51 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 51 times.
|
225 | CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height); |
404 | 204 | Merge3<ScalarType> operation; | |
405 | 204 | Rows<const ScalarType> src_c_rows{src2, src_strides[2]}; | |
406 | 204 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
407 | src_c_rows, dst_rows); | ||
408 |
8/8✓ Branch 0 taken 3 times.
✓ Branch 1 taken 51 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 51 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 51 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 51 times.
|
225 | } break; |
409 | |||
410 | case 4: { | ||
411 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 114 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 114 times.
|
456 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]); |
412 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 111 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 111 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 111 times.
|
447 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src3, srcs[3]); |
413 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 102 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 102 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 105 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 105 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 105 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 105 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 105 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 105 times.
|
438 | CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height); |
414 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 99 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 99 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 99 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 99 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 99 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 99 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 99 times.
|
417 | CHECK_POINTER_AND_STRIDE(src3, src_strides[3], height); |
415 | 396 | Merge4<ScalarType> operation; | |
416 | 396 | Rows<const ScalarType> src_c_rows{src2, src_strides[2]}; | |
417 | 396 | Rows<const ScalarType> src_d_rows{src3, src_strides[3]}; | |
418 | 396 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
419 | src_c_rows, src_d_rows, dst_rows); | ||
420 |
8/8✓ Branch 0 taken 6 times.
✓ Branch 1 taken 99 times.
✓ Branch 2 taken 15 times.
✓ Branch 3 taken 99 times.
✓ Branch 4 taken 15 times.
✓ Branch 5 taken 99 times.
✓ Branch 6 taken 15 times.
✓ Branch 7 taken 99 times.
|
447 | } break; |
421 | |||
422 | default: | ||
423 | 12 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
424 | } | ||
425 | 732 | return KLEIDICV_OK; | |
426 | 1140 | } | |
427 | // NOLINTEND(readability-function-cognitive-complexity) | ||
428 | |||
429 | KLEIDICV_TARGET_FN_ATTRS | ||
430 | 1143 | kleidicv_error_t merge(const void **srcs, const size_t *src_strides, void *dst, | |
431 | size_t dst_stride, size_t width, size_t height, | ||
432 | size_t channels, size_t element_size) { | ||
433 |
5/5✓ Branch 0 taken 303 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 231 times.
✓ Branch 3 taken 303 times.
✓ Branch 4 taken 303 times.
|
1143 | switch (element_size) { |
434 | case sizeof(uint8_t): | ||
435 | 462 | return merge<uint8_t>(srcs, src_strides, dst, dst_stride, width, height, | |
436 | 231 | channels); | |
437 | |||
438 | case sizeof(uint16_t): | ||
439 | 606 | return merge<uint16_t>(srcs, src_strides, dst, dst_stride, width, height, | |
440 | 303 | channels); | |
441 | |||
442 | case sizeof(uint32_t): | ||
443 | 606 | return merge<uint32_t>(srcs, src_strides, dst, dst_stride, width, height, | |
444 | 303 | channels); | |
445 | |||
446 | case sizeof(uint64_t): | ||
447 | 606 | return merge<uint64_t>(srcs, src_strides, dst, dst_stride, width, height, | |
448 | 303 | channels); | |
449 | |||
450 | default: | ||
451 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
452 | } | ||
453 | 1143 | } | |
454 | |||
455 | } // namespace kleidicv::neon | ||
456 |