KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/merge_neon.cpp
Date: 2025-09-25 14:13:34
           Exec   Total   Coverage
Lines:      148     148     100.0%
Functions:   36      36     100.0%
Branches:   209     209     100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/conversions/merge.h"
6 #include "kleidicv/kleidicv.h"
7 #include "kleidicv/neon.h"
8
9 namespace kleidicv::neon {
10
11 // ----------------------------------------
12 // ------------ Two-way merge -------------
13 // ----------------------------------------
14
15 // Generic 2-way merge implementation.
16 //
17 // Algorithm description
18 //
19 // Elements are identified by their intended final position in the output.
20 // The description is for 32-bit elements, but it works just the same way
21 // for different element sizes.
22 //
23 // VECTOR / LANE: 0 1 2 3
24 // src_a: [ 0, 2, 4, 6 ]
25 // src_b: [ 1, 3, 5, 7 ]
26 //
27 // zip1(a, b): [ 0, 1, 2, 3 ] -> d0
28 // zip2(a, b): [ 4, 5, 6, 7 ] -> d1
29 //
30 // Continuous store of { d0, d1 } gives the expected order.
31 template <typename ScalarType>
32 class Merge2 final : public UnrollTwice {
33 public:
34 using VecTraits = neon::VecTraits<ScalarType>;
35 using VectorType = typename VecTraits::VectorType;
36 using Vector2Type = typename VecTraits::Vector2Type;
37
38 960 void vector_path(VectorType src_a, VectorType src_b, ScalarType *dst) {
39 #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
40 Vector2Type dst_vect;
41 dst_vect.val[0] = src_a;
42 dst_vect.val[1] = src_b;
43 vst2q(&dst[0], dst_vect);
44 #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
45 960 Vector2Type dst_vect;
46 960 dst_vect.val[0] = vzip1q(src_a, src_b);
47 960 dst_vect.val[1] = vzip2q(src_a, src_b);
48 960 VecTraits::store(dst_vect, &dst[0]);
49
50 #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
51 960 }
52
53 192 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
54 ScalarType *dst) {
55 192 dst[0] = src_a[0];
56 192 dst[1] = src_b[0];
57 192 }
58 }; // end of class Merge2<ScalarType>
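
A minimal standalone sketch of the zip-based 2-way interleave described in the Merge2 comment above, written with raw ACLE intrinsics rather than the VecTraits/UnrollTwice framework (illustrative only, not code from the covered file). It interleaves one block of 32-bit elements and prints them in their final order:

// Sketch: zip1/zip2 two-way interleave on one uint32x4_t block (AArch64 only).
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  // src_a holds the even output positions, src_b the odd ones.
  const uint32_t a[4] = {0, 2, 4, 6};
  const uint32_t b[4] = {1, 3, 5, 7};
  uint32x4_t src_a = vld1q_u32(a);
  uint32x4_t src_b = vld1q_u32(b);

  uint32x4_t d0 = vzip1q_u32(src_a, src_b);  // [ 0, 1, 2, 3 ]
  uint32x4_t d1 = vzip2q_u32(src_a, src_b);  // [ 4, 5, 6, 7 ]

  uint32_t dst[8];
  vst1q_u32(&dst[0], d0);  // contiguous store gives 0..7 in order
  vst1q_u32(&dst[4], d1);

  for (int i = 0; i < 8; ++i) {
    printf("%u ", dst[i]);  // prints: 0 1 2 3 4 5 6 7
  }
  printf("\n");
  return 0;
}
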
59
60 // ----------------------------------------
61 // ---------- Three-way merge -------------
62 // ----------------------------------------
63
64 template <typename ScalarType>
65 class Merge3 final : public UnrollTwice {
66 public:
67 using VecTraits = neon::VecTraits<ScalarType>;
68 using VectorType = typename VecTraits::VectorType;
69 using Vector3Type = typename VecTraits::Vector3Type;
70
71 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
72
73 102 Merge3() : table_indices_{} {
74 102 neon::VecTraits<uint8_t>::load(lookup_table(ScalarType()), table_indices_);
75 102 }
76
77 #endif
78
79 864 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
80 ScalarType *dst) {
81 864 Vector3Type dst_vect;
82 #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
83 dst_vect.val[0] = src_a;
84 dst_vect.val[1] = src_b;
85 dst_vect.val[2] = src_c;
86 vst3q(&dst[0], dst_vect);
87 #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
88 864 uint8x16x3_t src_vect;
89 864 src_vect.val[0] = vreinterpretq_u8(src_a);
90 864 src_vect.val[1] = vreinterpretq_u8(src_b);
91 864 src_vect.val[2] = vreinterpretq_u8(src_c);
92 864 dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]);
93 864 dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]);
94 864 dst_vect.val[2] = vqtbl3q_u8(src_vect, table_indices_.val[2]);
95 864 VecTraits::store(dst_vect, &dst[0]);
96 #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
97 864 }
98
99 192 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
100 const ScalarType *src_c, ScalarType *dst) {
101 192 dst[0] = src_a[0];
102 192 dst[1] = src_b[0];
103 192 dst[2] = src_c[0];
104 192 }
105
106 private:
107 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
108 51 static uint8_t *lookup_table(uint8_t) {
109 // clang-format off
110 static uint8_t kIndices[48] = {
111 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5,
112 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26,
113 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47,
114 };
115 51 return &kIndices[0];
116 }
117
118 // Lookup table for 16-bit inputs.
119 51 static uint8_t *lookup_table(uint16_t) {
120 // clang-format off
121 static uint8_t kIndices[48] = {
122 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21,
123 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11,
124 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47,
125 };
126 // clang-format on
127 51 return &kIndices[0];
128 }
129
130 uint8x16x3_t table_indices_;
131 #endif
132 }; // end of class Merge3<ScalarType>
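
The generic Merge3 path above interleaves three planes with vqtbl3q_u8 and a 48-byte index table. A standalone sketch of that lookup for 8-bit elements, reusing the lookup_table(uint8_t) indices shown above (illustrative only, not code from the covered file):

// Sketch: table-driven 3-way interleave of one 16-byte block per plane.
// Indices 0-15 select bytes of src.val[0], 16-31 of val[1], 32-47 of val[2].
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  static const uint8_t kIndices[48] = {
      0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5,
      21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26,
      42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47,
  };
  uint8_t a[16], b[16], c[16];
  for (uint8_t i = 0; i < 16; ++i) {
    a[i] = static_cast<uint8_t>(3 * i);      // final positions 0, 3, 6, ...
    b[i] = static_cast<uint8_t>(3 * i + 1);  // final positions 1, 4, 7, ...
    c[i] = static_cast<uint8_t>(3 * i + 2);  // final positions 2, 5, 8, ...
  }

  uint8x16x3_t src;
  src.val[0] = vld1q_u8(a);
  src.val[1] = vld1q_u8(b);
  src.val[2] = vld1q_u8(c);

  uint8_t dst[48];
  vst1q_u8(&dst[0],  vqtbl3q_u8(src, vld1q_u8(&kIndices[0])));
  vst1q_u8(&dst[16], vqtbl3q_u8(src, vld1q_u8(&kIndices[16])));
  vst1q_u8(&dst[32], vqtbl3q_u8(src, vld1q_u8(&kIndices[32])));

  for (int i = 0; i < 48; ++i) {
    printf("%u ", dst[i]);  // prints: 0 1 2 ... 47 in order
  }
  printf("\n");
  return 0;
}
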
133
134 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
135
136 // Specialized 3-way merge implementation for 32-bit elements.
137 //
138 // Algorithm description
139 //
140 // Elements are identified by their intended final position in the output.
141 //
142 // VECTOR / LANE: 0 1 2 3
143 // src_a: [ 0, 3, 6, 9 ]
144 // src_b: [ 1, 4, 7, 10 ]
145 // src_c: [ 2, 5, 8, 11 ]
146 //
147 // trn2(a, b): [ 3, 4, 9, 10 ] -> w
148 // trn1(c, w): [ 2, 3, 8, 9 ] -> x
149 // trn2(w, c): [ 4, 5, 10, 11 ] -> y
150 // trn1(a, b): [ 0, 1, 6, 7 ] -> z
151 //
152 // zip1_u64(z, x): [ 0, 1, 2, 3 ] -> d0
153 // [ y_u64[0], z_u64[1] ]: [ 4, 5, 6, 7 ] -> d1
154 // zip2_u64(x, y): [ 8, 9, 10, 11 ] -> d2
155 //
156 // Continuous store of { d0, d1, d2 } gives the expected order.
157 template <>
158 class Merge3<uint32_t> final : public UnrollTwice {
159 public:
160 using ScalarType = uint32_t;
161 using VecTraits = neon::VecTraits<ScalarType>;
162 using VectorType = typename VecTraits::VectorType;
163 using Vector3Type = typename VecTraits::Vector3Type;
164
165 432 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
166 ScalarType *dst) {
167 432 uint32x4_t w = vtrn2q_u32(src_a, src_b);
168 432 uint32x4_t x = vtrn1q_u32(src_c, w);
169 432 uint32x4_t y = vtrn2q_u32(w, src_c);
170 432 uint32x4_t z = vtrn1q_u32(src_a, src_b);
171
172 432 uint32x4_t dst_vect_0 = vzip1q_u64(z, x);
173 432 uint64x2_t dst_vect_1 = y;
174 432 dst_vect_1[1] = vreinterpretq_u64_u32(z)[1];
175 432 uint32x4_t dst_vect_2 = vzip2q_u64(x, y);
176
177 // Not using vst1q_u32_x3, because the requirement for consecutive vector
178 // register allocation may result in longer code.
179 432 vst1q_u32(&dst[0 * VecTraits::num_lanes()], dst_vect_0);
180 432 vst1q_u32(&dst[1 * VecTraits::num_lanes()], dst_vect_1);
181 432 vst1q_u32(&dst[2 * VecTraits::num_lanes()], dst_vect_2);
182 432 }
183
184 96 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
185 const ScalarType *src_c, ScalarType *dst) {
186 96 dst[0] = src_a[0];
187 96 dst[1] = src_b[0];
188 96 dst[2] = src_c[0];
189 96 }
190 }; // end of class Merge3<uint32_t>
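
The uint32_t specialization above appears to call vzip1q_u64/vzip2q_u64 through type-generic wrappers from neon.h (an assumption about those helpers); in plain ACLE the same trn/zip scheme needs explicit reinterprets, as in this standalone sketch (illustrative only, not code from the covered file):

// Sketch: 3-way merge of one 32-bit block using the trn/zip scheme above.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t pa[4] = {0, 3, 6, 9};
  const uint32_t pb[4] = {1, 4, 7, 10};
  const uint32_t pc[4] = {2, 5, 8, 11};
  uint32x4_t a = vld1q_u32(pa), b = vld1q_u32(pb), c = vld1q_u32(pc);

  uint32x4_t w = vtrn2q_u32(a, b);  // [ 3,  4,  9, 10 ]
  uint32x4_t x = vtrn1q_u32(c, w);  // [ 2,  3,  8,  9 ]
  uint32x4_t y = vtrn2q_u32(w, c);  // [ 4,  5, 10, 11 ]
  uint32x4_t z = vtrn1q_u32(a, b);  // [ 0,  1,  6,  7 ]

  uint64x2_t x64 = vreinterpretq_u64_u32(x);
  uint64x2_t y64 = vreinterpretq_u64_u32(y);
  uint64x2_t z64 = vreinterpretq_u64_u32(z);

  uint64x2_t d0 = vzip1q_u64(z64, x64);                            // [ 0 .. 3 ]
  uint64x2_t d1 = vsetq_lane_u64(vgetq_lane_u64(z64, 1), y64, 1);  // [ 4 .. 7 ]
  uint64x2_t d2 = vzip2q_u64(x64, y64);                            // [ 8 .. 11 ]

  uint32_t dst[12];
  vst1q_u32(&dst[0], vreinterpretq_u32_u64(d0));
  vst1q_u32(&dst[4], vreinterpretq_u32_u64(d1));
  vst1q_u32(&dst[8], vreinterpretq_u32_u64(d2));

  for (int i = 0; i < 12; ++i) {
    printf("%u ", dst[i]);  // prints: 0 1 2 ... 11
  }
  printf("\n");
  return 0;
}
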
191
192 // Specialized 3-way merge implementation for 64-bit elements.
193 //
194 // Algorithm description
195 //
196 // Elements are identified by their intended final position in the output.
197 //
198 // VECTOR / LANE: 0 1
199 // src_a: [ 0, 3 ]
200 // src_b: [ 1, 4 ]
201 // src_c: [ 2, 5 ]
202 //
203 // zip1(a, b): [ 0, 1 ] -> d0
204 // [ src_c[0], src_a[1] ]: [ 2, 3 ] -> d1
205 // zip2(b, c): [ 4, 5 ] -> d2
206 //
207 // Continuous store of { d0, d1, d2 } gives the expected order.
208 template <>
209 class Merge3<uint64_t> final : public UnrollTwice {
210 public:
211 using ScalarType = uint64_t;
212 using VecTraits = neon::VecTraits<ScalarType>;
213 using VectorType = typename VecTraits::VectorType;
214
215 432 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
216 ScalarType *dst) {
217 432 uint64x2x3_t dst_vect;
218 432 dst_vect.val[0] = vzip1q_u64(src_a, src_b);
219 432 dst_vect.val[1] = src_c;
220 432 dst_vect.val[1][1] = src_a[1];
221 432 dst_vect.val[2] = vzip2q_u64(src_b, src_c);
222
223 432 VecTraits::store(dst_vect, &dst[0]);
224 432 }
225
226 96 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
227 const ScalarType *src_c, ScalarType *dst) {
228 96 dst[0] = src_a[0];
229 96 dst[1] = src_b[0];
230 96 dst[2] = src_c[0];
231 96 }
232 }; // end of class Merge3<uint64_t>
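
The lane copy dst_vect.val[1][1] = src_a[1] above uses vector subscripting, a GCC/Clang extension; the portable intrinsic form is vsetq_lane_u64/vgetq_lane_u64, shown in this standalone sketch of one 64-bit 3-way block (illustrative only, not code from the covered file):

// Sketch: 3-way merge of one 64-bit block.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t pa[2] = {0, 3}, pb[2] = {1, 4}, pc[2] = {2, 5};
  uint64x2_t a = vld1q_u64(pa), b = vld1q_u64(pb), c = vld1q_u64(pc);

  uint64x2_t d0 = vzip1q_u64(a, b);                            // [ 0, 1 ]
  uint64x2_t d1 = vsetq_lane_u64(vgetq_lane_u64(a, 1), c, 1);  // [ 2, 3 ]
  uint64x2_t d2 = vzip2q_u64(b, c);                            // [ 4, 5 ]

  uint64_t dst[6];
  vst1q_u64(&dst[0], d0);
  vst1q_u64(&dst[2], d1);
  vst1q_u64(&dst[4], d2);
  for (int i = 0; i < 6; ++i) {
    printf("%llu ", static_cast<unsigned long long>(dst[i]));  // 0 1 2 3 4 5
  }
  printf("\n");
  return 0;
}
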
233
234 #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
235
236 // ----------------------------------------
237 // ----------- Four-way merge -------------
238 // ----------------------------------------
239
240 // Generic 4-way merge implementation.
241 //
242 // Algorithm description
243 //
244 // Elements are identified by their intended final position in the output.
245 // The description is for 32-bit elements, but it works just the same way
246 // for smaller element sizes.
247 //
248 // VECTOR / LANE: 0 1 2 3
249 // src_a: [ 0, 4, 8, 12 ]
250 // src_b: [ 1, 5, 9, 13 ]
251 // src_c: [ 2, 6, 10, 14 ]
252 // src_d: [ 3, 7, 11, 15 ]
253 //
254 // zip1_u32(a, b): [ 0, 1, 4, 5 ] -> w
255 // zip1_u32(c, d): [ 2, 3, 6, 7 ] -> x
256 // zip2_u32(a, b): [ 8, 9, 12, 13 ] -> y
257 // zip2_u32(c, d): [ 10, 11, 14, 15 ] -> z
258 //
259 // zip1_u64(w, x): [ 0, 1, 2, 3 ] -> d0
260 // zip2_u64(w, x): [ 4, 5, 6, 7 ] -> d1
261 // zip1_u64(y, z): [ 8, 9, 10, 11 ] -> d2
262 // zip2_u64(y, z): [ 12, 13, 14, 15 ] -> d3
263 //
264 // Continuous store of { d0, d1, d2, d3 } gives the expected order.
265 template <typename ScalarType>
266 class Merge4 final : public UnrollTwice {
267 public:
268 using VecTraits = neon::VecTraits<ScalarType>;
269 using VectorType = typename VecTraits::VectorType;
270 using Vector4Type = typename VecTraits::Vector4Type;
271
272 2448 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
273 VectorType src_d, ScalarType *dst) {
274 #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
275 Vector4Type dst_vect;
276 dst_vect.val[0] = src_a;
277 dst_vect.val[1] = src_b;
278 dst_vect.val[2] = src_c;
279 dst_vect.val[3] = src_d;
280 vst4q(&dst[0], dst_vect);
281 #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
282 2448 auto zip1_a_b = double_width(vzip1q(src_a, src_b));
283 2448 auto zip1_c_d = double_width(vzip1q(src_c, src_d));
284 2448 auto zip2_a_b = double_width(vzip2q(src_a, src_b));
285 2448 auto zip2_c_d = double_width(vzip2q(src_c, src_d));
286
287 // Compilers tend to replace zip instructions with mov, resulting in
288 // longer generated code. Omitting a bitcast appears to help.
289 using DoubleScalarType = double_element_width_t<ScalarType>;
290 2448 typename neon::VecTraits<DoubleScalarType>::Vector4Type dst_vect;
291 2448 dst_vect.val[0] = vzip1q(zip1_a_b, zip1_c_d);
292 2448 dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d);
293 2448 dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d);
294 2448 dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d);
295 2448 neon::VecTraits<DoubleScalarType>::store(
296 2448 dst_vect, reinterpret_cast<DoubleScalarType *>(&dst[0]));
297
298 #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
299 2448 }
300
301 576 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
302 const ScalarType *src_c, const ScalarType *src_d,
303 ScalarType *dst) {
304 576 dst[0] = src_a[0];
305 576 dst[1] = src_b[0];
306 576 dst[2] = src_c[0];
307 576 dst[3] = src_d[0];
308 576 }
309
310 private:
311 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
312 // Polymorphic reinterpret_cast<>() between vector types where the element
313 // size is doubled. For example, if 'VectorType' is 'uint8x16_t', this
314 // method returns 'reinterpret_cast<uint16x8_t>(vector)'.
315 9792 static double_element_width_t<VectorType> double_width(VectorType vector) {
316 9792 return reinterpret_cast<double_element_width_t<VectorType>>(vector);
317 }
318 #endif
319 }; // end of class Merge4<ScalarType>
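
A standalone sketch of the double_width() trick used by the generic Merge4 above, shown for 32-bit elements (illustrative only, not code from the covered file): the first zip level already places output elements in adjacent pairs, so the second level can zip 64-bit lanes directly.

// Sketch: 4-way merge of one 32-bit block via two zip levels.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t pa[4] = {0, 4, 8, 12}, pb[4] = {1, 5, 9, 13};
  const uint32_t pc[4] = {2, 6, 10, 14}, pd[4] = {3, 7, 11, 15};
  uint32x4_t a = vld1q_u32(pa), b = vld1q_u32(pb);
  uint32x4_t c = vld1q_u32(pc), d = vld1q_u32(pd);

  // First level: pair up (a,b) and (c,d) at 32-bit granularity.
  uint64x2_t w = vreinterpretq_u64_u32(vzip1q_u32(a, b));  // [ {0,1},   {4,5}   ]
  uint64x2_t x = vreinterpretq_u64_u32(vzip1q_u32(c, d));  // [ {2,3},   {6,7}   ]
  uint64x2_t y = vreinterpretq_u64_u32(vzip2q_u32(a, b));  // [ {8,9},   {12,13} ]
  uint64x2_t z = vreinterpretq_u64_u32(vzip2q_u32(c, d));  // [ {10,11}, {14,15} ]

  // Second level: zip the 64-bit pairs into the final order and store.
  uint32_t dst[16];
  vst1q_u32(&dst[0],  vreinterpretq_u32_u64(vzip1q_u64(w, x)));  // 0 .. 3
  vst1q_u32(&dst[4],  vreinterpretq_u32_u64(vzip2q_u64(w, x)));  // 4 .. 7
  vst1q_u32(&dst[8],  vreinterpretq_u32_u64(vzip1q_u64(y, z)));  // 8 .. 11
  vst1q_u32(&dst[12], vreinterpretq_u32_u64(vzip2q_u64(y, z)));  // 12 .. 15

  for (int i = 0; i < 16; ++i) {
    printf("%u ", dst[i]);  // prints: 0 1 2 ... 15
  }
  printf("\n");
  return 0;
}
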
320
321 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
322
323 // Specialized 4-way merge implementation for 64-bit elements.
324 //
325 // Algorithm description
326 //
327 // Elements are identified by their intended final position in the output.
328 //
329 // VECTOR / LANE: 0 1
330 // src_a: [ 0, 4 ]
331 // src_b: [ 1, 5 ]
332 // src_c: [ 2, 6 ]
333 // src_d: [ 3, 7 ]
334 //
335 // zip1(a, b): [ 0, 1 ] -> d0
336 // zip1(c, d): [ 2, 3 ] -> d1
337 // zip2(a, b): [ 4, 5 ] -> d2
338 // zip2(c, d): [ 6, 7 ] -> d3
339 //
340 // Continuous store of { d0, d1, d2, d3 } gives the expected order.
341 template <>
342 class Merge4<uint64_t> final : public UnrollTwice {
343 public:
344 using ScalarType = uint64_t;
345 using VecTraits = neon::VecTraits<ScalarType>;
346 using VectorType = typename VecTraits::VectorType;
347 using Vector4Type = typename VecTraits::Vector4Type;
348
349 816 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
350 VectorType src_d, ScalarType *dst) {
351 816 Vector4Type dst_vect;
352 816 dst_vect.val[0] = vzip1q(src_a, src_b);
353 816 dst_vect.val[1] = vzip1q(src_c, src_d);
354 816 dst_vect.val[2] = vzip2q(src_a, src_b);
355 816 dst_vect.val[3] = vzip2q(src_c, src_d);
356 816 VecTraits::store(dst_vect, &dst[0]);
357 816 }
358
359 192 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
360 const ScalarType *src_c, const ScalarType *src_d,
361 ScalarType *dst) {
362 192 dst[0] = src_a[0];
363 192 dst[1] = src_b[0];
364 192 dst[2] = src_c[0];
365 192 dst[3] = src_d[0];
366 192 }
367 }; // end of class Merge4<uint64_t>
368
369 #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
370
371 // Most of the complexity comes from parameter checking.
372 // NOLINTBEGIN(readability-function-cognitive-complexity)
373 template <typename ScalarType>
374 1140 kleidicv_error_t merge(const void **srcs, const size_t *src_strides,
375 void *dst_void, size_t dst_stride, size_t width,
376 size_t height, size_t channels) {
377 8/8 1140 if (channels < 2) {
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 228 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 300 times.
    ✓ Branch 4 taken 3 times.
    ✓ Branch 5 taken 300 times.
    ✓ Branch 6 taken 3 times.
    ✓ Branch 7 taken 300 times.
378 12 return KLEIDICV_ERROR_RANGE;
379 }
380 8/8 1128 CHECK_POINTERS(srcs, src_strides);
    ✓ Branch 0 taken 6 times.
    ✓ Branch 1 taken 222 times.
    ✓ Branch 2 taken 6 times.
    ✓ Branch 3 taken 294 times.
    ✓ Branch 4 taken 6 times.
    ✓ Branch 5 taken 294 times.
    ✓ Branch 6 taken 6 times.
    ✓ Branch 7 taken 294 times.
381 6/6 1104 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src0, srcs[0]);
    ✓ Branch 0 taken 9 times.
    ✓ Branch 1 taken 285 times.
    ✓ Branch 2 taken 9 times.
    ✓ Branch 3 taken 285 times.
    ✓ Branch 4 taken 9 times.
    ✓ Branch 5 taken 285 times.
382 6/6 1077 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src1, srcs[1]);
    ✓ Branch 0 taken 9 times.
    ✓ Branch 1 taken 276 times.
    ✓ Branch 2 taken 9 times.
    ✓ Branch 3 taken 276 times.
    ✓ Branch 4 taken 9 times.
    ✓ Branch 5 taken 276 times.
383 6/6 1050 MAKE_POINTER_CHECK_ALIGNMENT(ScalarType, dst, dst_void);
    ✓ Branch 0 taken 9 times.
    ✓ Branch 1 taken 267 times.
    ✓ Branch 2 taken 9 times.
    ✓ Branch 3 taken 267 times.
    ✓ Branch 4 taken 9 times.
    ✓ Branch 5 taken 267 times.
384 16/16 1023 CHECK_POINTER_AND_STRIDE(src0, src_strides[0], height);
    ✓ Branch 0 taken 9 times.
    ✓ Branch 1 taken 213 times.
    ✓ Branch 2 taken 9 times.
    ✓ Branch 3 taken 213 times.
    ✓ Branch 4 taken 18 times.
    ✓ Branch 5 taken 249 times.
    ✓ Branch 6 taken 18 times.
    ✓ Branch 7 taken 249 times.
    ✓ Branch 8 taken 18 times.
    ✓ Branch 9 taken 249 times.
    ✓ Branch 10 taken 18 times.
    ✓ Branch 11 taken 249 times.
    ✓ Branch 12 taken 18 times.
    ✓ Branch 13 taken 249 times.
    ✓ Branch 14 taken 18 times.
    ✓ Branch 15 taken 249 times.
385 16/16 960 CHECK_POINTER_AND_STRIDE(src1, src_strides[1], height);
    ✓ Branch 0 taken 9 times.
    ✓ Branch 1 taken 204 times.
    ✓ Branch 2 taken 9 times.
    ✓ Branch 3 taken 204 times.
    ✓ Branch 4 taken 18 times.
    ✓ Branch 5 taken 231 times.
    ✓ Branch 6 taken 18 times.
    ✓ Branch 7 taken 231 times.
    ✓ Branch 8 taken 18 times.
    ✓ Branch 9 taken 231 times.
    ✓ Branch 10 taken 18 times.
    ✓ Branch 11 taken 231 times.
    ✓ Branch 12 taken 18 times.
    ✓ Branch 13 taken 231 times.
    ✓ Branch 14 taken 18 times.
    ✓ Branch 15 taken 231 times.
386 16/16 897 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 201 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 201 times.
    ✓ Branch 4 taken 12 times.
    ✓ Branch 5 taken 219 times.
    ✓ Branch 6 taken 12 times.
    ✓ Branch 7 taken 219 times.
    ✓ Branch 8 taken 12 times.
    ✓ Branch 9 taken 219 times.
    ✓ Branch 10 taken 12 times.
    ✓ Branch 11 taken 219 times.
    ✓ Branch 12 taken 12 times.
    ✓ Branch 13 taken 219 times.
    ✓ Branch 14 taken 12 times.
    ✓ Branch 15 taken 219 times.
387 24/24 858 CHECK_IMAGE_SIZE(width, height);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 198 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 195 times.
    ✓ Branch 4 taken 6 times.
    ✓ Branch 5 taken 195 times.
    ✓ Branch 6 taken 3 times.
    ✓ Branch 7 taken 216 times.
    ✓ Branch 8 taken 3 times.
    ✓ Branch 9 taken 213 times.
    ✓ Branch 10 taken 6 times.
    ✓ Branch 11 taken 213 times.
    ✓ Branch 12 taken 3 times.
    ✓ Branch 13 taken 216 times.
    ✓ Branch 14 taken 3 times.
    ✓ Branch 15 taken 213 times.
    ✓ Branch 16 taken 6 times.
    ✓ Branch 17 taken 213 times.
    ✓ Branch 18 taken 3 times.
    ✓ Branch 19 taken 216 times.
    ✓ Branch 20 taken 3 times.
    ✓ Branch 21 taken 213 times.
    ✓ Branch 22 taken 6 times.
    ✓ Branch 23 taken 213 times.
388
389 834 Rectangle rect{width, height};
390 834 Rows<const ScalarType> src_a_rows{src0, src_strides[0]};
391 834 Rows<const ScalarType> src_b_rows{src1, src_strides[1]};
392 834 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
393
394 16/16 834 switch (channels) {
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 33 times.
    ✓ Branch 2 taken 54 times.
    ✓ Branch 3 taken 105 times.
    ✓ Branch 4 taken 3 times.
    ✓ Branch 5 taken 33 times.
    ✓ Branch 6 taken 60 times.
    ✓ Branch 7 taken 117 times.
    ✓ Branch 8 taken 3 times.
    ✓ Branch 9 taken 33 times.
    ✓ Branch 10 taken 60 times.
    ✓ Branch 11 taken 117 times.
    ✓ Branch 12 taken 3 times.
    ✓ Branch 13 taken 33 times.
    ✓ Branch 14 taken 60 times.
    ✓ Branch 15 taken 117 times.
395 case 2: {
396 132 Merge2<ScalarType> operation;
397 132 apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows,
398 dst_rows);
399 132 } break;
400
401 case 3: {
402 6/6 234 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 57 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 57 times.
    ✓ Branch 4 taken 3 times.
    ✓ Branch 5 taken 57 times.
403 16/16 225 CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 51 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 51 times.
    ✓ Branch 4 taken 6 times.
    ✓ Branch 5 taken 51 times.
    ✓ Branch 6 taken 6 times.
    ✓ Branch 7 taken 51 times.
    ✓ Branch 8 taken 6 times.
    ✓ Branch 9 taken 51 times.
    ✓ Branch 10 taken 6 times.
    ✓ Branch 11 taken 51 times.
    ✓ Branch 12 taken 6 times.
    ✓ Branch 13 taken 51 times.
    ✓ Branch 14 taken 6 times.
    ✓ Branch 15 taken 51 times.
404 204 Merge3<ScalarType> operation;
405 204 Rows<const ScalarType> src_c_rows{src2, src_strides[2]};
406 204 apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows,
407 src_c_rows, dst_rows);
408 8/8 225 } break;
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 51 times.
    ✓ Branch 2 taken 6 times.
    ✓ Branch 3 taken 51 times.
    ✓ Branch 4 taken 6 times.
    ✓ Branch 5 taken 51 times.
    ✓ Branch 6 taken 6 times.
    ✓ Branch 7 taken 51 times.
409
410 case 4: {
411 6/6 456 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 114 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 114 times.
    ✓ Branch 4 taken 3 times.
    ✓ Branch 5 taken 114 times.
412 6/6 447 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src3, srcs[3]);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 111 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 111 times.
    ✓ Branch 4 taken 3 times.
    ✓ Branch 5 taken 111 times.
413 16/16 438 CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 102 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 102 times.
    ✓ Branch 4 taken 6 times.
    ✓ Branch 5 taken 105 times.
    ✓ Branch 6 taken 6 times.
    ✓ Branch 7 taken 105 times.
    ✓ Branch 8 taken 6 times.
    ✓ Branch 9 taken 105 times.
    ✓ Branch 10 taken 6 times.
    ✓ Branch 11 taken 105 times.
    ✓ Branch 12 taken 6 times.
    ✓ Branch 13 taken 105 times.
    ✓ Branch 14 taken 6 times.
    ✓ Branch 15 taken 105 times.
414 16/16 417 CHECK_POINTER_AND_STRIDE(src3, src_strides[3], height);
    ✓ Branch 0 taken 3 times.
    ✓ Branch 1 taken 99 times.
    ✓ Branch 2 taken 3 times.
    ✓ Branch 3 taken 99 times.
    ✓ Branch 4 taken 6 times.
    ✓ Branch 5 taken 99 times.
    ✓ Branch 6 taken 6 times.
    ✓ Branch 7 taken 99 times.
    ✓ Branch 8 taken 6 times.
    ✓ Branch 9 taken 99 times.
    ✓ Branch 10 taken 6 times.
    ✓ Branch 11 taken 99 times.
    ✓ Branch 12 taken 6 times.
    ✓ Branch 13 taken 99 times.
    ✓ Branch 14 taken 6 times.
    ✓ Branch 15 taken 99 times.
415 396 Merge4<ScalarType> operation;
416 396 Rows<const ScalarType> src_c_rows{src2, src_strides[2]};
417 396 Rows<const ScalarType> src_d_rows{src3, src_strides[3]};
418 396 apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows,
419 src_c_rows, src_d_rows, dst_rows);
420 8/8 447 } break;
    ✓ Branch 0 taken 6 times.
    ✓ Branch 1 taken 99 times.
    ✓ Branch 2 taken 15 times.
    ✓ Branch 3 taken 99 times.
    ✓ Branch 4 taken 15 times.
    ✓ Branch 5 taken 99 times.
    ✓ Branch 6 taken 15 times.
    ✓ Branch 7 taken 99 times.
421
422 default:
423 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
424 }
425 732 return KLEIDICV_OK;
426 1140 }
427 // NOLINTEND(readability-function-cognitive-complexity)
428
429 KLEIDICV_TARGET_FN_ATTRS
430 1143 kleidicv_error_t merge(const void **srcs, const size_t *src_strides, void *dst,
431 size_t dst_stride, size_t width, size_t height,
432 size_t channels, size_t element_size) {
433 5/5 1143 switch (element_size) {
    ✓ Branch 0 taken 303 times.
    ✓ Branch 1 taken 3 times.
    ✓ Branch 2 taken 231 times.
    ✓ Branch 3 taken 303 times.
    ✓ Branch 4 taken 303 times.
434 case sizeof(uint8_t):
435 462 return merge<uint8_t>(srcs, src_strides, dst, dst_stride, width, height,
436 231 channels);
437
438 case sizeof(uint16_t):
439 606 return merge<uint16_t>(srcs, src_strides, dst, dst_stride, width, height,
440 303 channels);
441
442 case sizeof(uint32_t):
443 606 return merge<uint32_t>(srcs, src_strides, dst, dst_stride, width, height,
444 303 channels);
445
446 case sizeof(uint64_t):
447 606 return merge<uint64_t>(srcs, src_strides, dst, dst_stride, width, height,
448 303 channels);
449
450 default:
451 3 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
452 }
453 1143 }
454
455 } // namespace kleidicv::neon
456