KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/merge_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 148 148 100.0%
Functions: 36 36 100.0%
Branches: 209 209 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/conversions/merge.h"
6 #include "kleidicv/kleidicv.h"
7 #include "kleidicv/neon.h"
8
9 namespace kleidicv::neon {
10
11 // ----------------------------------------
12 // ------------ Two-way merge -------------
13 // ----------------------------------------
14
15 // Generic 2-way merge implementation.
16 //
17 // Algorithm description
18 //
19 // Elements are identified by their intended final position in the output.
20 // The description is for 32-bit elements, but it works just the same way
21 // for different element sizes.
22 //
23 // VECTOR / LANE: 0 1 2 3
24 // src_a: [ 0, 2, 4, 6 ]
25 // src_b: [ 1, 3, 5, 7 ]
26 //
27 // zip1(a, b): [ 0, 1, 2, 3 ] -> d0
28 // zip2(a, b): [ 4, 5, 6, 7 ] -> d1
29 //
30 // Continuous store of { d0, d1 } gives the expected order.
31 template <typename ScalarType>
32 class Merge2 final : public UnrollTwice {
33 public:
34 using VecTraits = neon::VecTraits<ScalarType>;
35 using VectorType = typename VecTraits::VectorType;
36 using Vector2Type = typename VecTraits::Vector2Type;
37
38 1600 void vector_path(VectorType src_a, VectorType src_b, ScalarType *dst) {
39 #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
40 Vector2Type dst_vect;
41 dst_vect.val[0] = src_a;
42 dst_vect.val[1] = src_b;
43 vst2q(&dst[0], dst_vect);
44 #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
45 1600 Vector2Type dst_vect;
46 1600 dst_vect.val[0] = vzip1q(src_a, src_b);
47 1600 dst_vect.val[1] = vzip2q(src_a, src_b);
48 1600 VecTraits::store(dst_vect, &dst[0]);
49
50 #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
51 1600 }
52
53 256 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
54 ScalarType *dst) {
55 256 dst[0] = src_a[0];
56 256 dst[1] = src_b[0];
57 256 }
58 }; // end of class Merge2<ScalarType>
59
60 // ----------------------------------------
61 // ---------- Three-way merge -------------
62 // ----------------------------------------
63
64 template <typename ScalarType>
65 class Merge3 final : public UnrollTwice {
66 public:
67 using VecTraits = neon::VecTraits<ScalarType>;
68 using VectorType = typename VecTraits::VectorType;
69 using Vector3Type = typename VecTraits::Vector3Type;
70
71 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
72
73 136 Merge3() : table_indices_{} {
74 136 neon::VecTraits<uint8_t>::load(lookup_table(ScalarType()), table_indices_);
75 136 }
76
77 #endif
78
79 1440 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
80 ScalarType *dst) {
81 1440 Vector3Type dst_vect;
82 #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
83 dst_vect.val[0] = src_a;
84 dst_vect.val[1] = src_b;
85 dst_vect.val[2] = src_c;
86 vst3q(&dst[0], dst_vect);
87 #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
88 1440 uint8x16x3_t src_vect;
89 1440 src_vect.val[0] = vreinterpretq_u8(src_a);
90 1440 src_vect.val[1] = vreinterpretq_u8(src_b);
91 1440 src_vect.val[2] = vreinterpretq_u8(src_c);
92 1440 dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]);
93 1440 dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]);
94 1440 dst_vect.val[2] = vqtbl3q_u8(src_vect, table_indices_.val[2]);
95 1440 VecTraits::store(dst_vect, &dst[0]);
96 #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
97 1440 }
98
99 256 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
100 const ScalarType *src_c, ScalarType *dst) {
101 256 dst[0] = src_a[0];
102 256 dst[1] = src_b[0];
103 256 dst[2] = src_c[0];
104 256 }
105
106 private:
107 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
108 68 static uint8_t *lookup_table(uint8_t) {
109 // clang-format off
110 static uint8_t kIndices[48] = {
111 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5,
112 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26,
113 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47,
114 };
115 68 return &kIndices[0];
116 }
117
118 // Lookup table for 16-bit inputs.
119 68 static uint8_t *lookup_table(uint16_t) {
120 // clang-format off
121 static uint8_t kIndices[48] = {
122 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21,
123 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11,
124 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47,
125 };
126 // clang-format on
127 68 return &kIndices[0];
128 }
129
130 uint8x16x3_t table_indices_;
131 #endif
132 }; // end of class Merge3<ScalarType>
133
134 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
135
136 // Specialized 3-way merge implementation for 32-bit elements.
137 //
138 // Algorithm description
139 //
140 // Elements are identified by their intended final position in the output.
141 //
142 // VECTOR / LANE: 0 1 2 3
143 // src_a: [ 0, 3, 6, 9 ]
144 // src_b: [ 1, 4, 7, 10 ]
145 // src_c: [ 2, 5, 8, 11 ]
146 //
147 // trn2(a, b): [ 3, 4, 9, 10 ] -> w
148 // trn1(c, w): [ 2, 3, 8, 9 ] -> x
149 // trn2(w, c): [ 4, 5, 10, 11 ] -> y
150 // trn1(a, b): [ 0, 1, 6, 7 ] -> z
151 //
152 // zip1_u64(z, x): [ 0, 1, 2, 3 ] -> d0
153 // [ y_u64[0], z_u64[1] ]: [ 4, 5, 6, 7 ] -> d1
154 // zip2_u64(x, y): [ 8, 9, 10, 11 ] -> d2
155 //
156 // Continuous store of { d0, d1, d2 } gives the expected order.
157 template <>
158 class Merge3<uint32_t> final : public UnrollTwice {
159 public:
160 using ScalarType = uint32_t;
161 using VecTraits = neon::VecTraits<ScalarType>;
162 using VectorType = typename VecTraits::VectorType;
163 using Vector3Type = typename VecTraits::Vector3Type;
164
165 720 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
166 ScalarType *dst) {
167 720 uint32x4_t w = vtrn2q_u32(src_a, src_b);
168 720 uint32x4_t x = vtrn1q_u32(src_c, w);
169 720 uint32x4_t y = vtrn2q_u32(w, src_c);
170 720 uint32x4_t z = vtrn1q_u32(src_a, src_b);
171
172 720 uint32x4_t dst_vect_0 = vzip1q_u64(z, x);
173 720 uint64x2_t dst_vect_1 = y;
174 720 dst_vect_1[1] = vreinterpretq_u64_u32(z)[1];
175 720 uint32x4_t dst_vect_2 = vzip2q_u64(x, y);
176
177 // Not using vst1q_u32_x3, because the requirement on consecutive vector
178 // register allocation may result in longer code.
179 720 vst1q_u32(&dst[0 * VecTraits::num_lanes()], dst_vect_0);
180 720 vst1q_u32(&dst[1 * VecTraits::num_lanes()], dst_vect_1);
181 720 vst1q_u32(&dst[2 * VecTraits::num_lanes()], dst_vect_2);
182 720 }
183
184 128 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
185 const ScalarType *src_c, ScalarType *dst) {
186 128 dst[0] = src_a[0];
187 128 dst[1] = src_b[0];
188 128 dst[2] = src_c[0];
189 128 }
190 }; // end of class Merge3<uint32_t>
191
192 // Specialized 3-way merge implementation for 64-bit elements.
193 //
194 // Algorithm description
195 //
196 // Elements are identified by their intended final position in the output.
197 //
198 // VECTOR / LANE: 0 1
199 // src_a: [ 0, 3 ]
200 // src_b: [ 1, 4 ]
201 // src_c: [ 2, 5 ]
202 //
203 // zip1(a, b): [ 0, 1 ] -> d0
204 // [ src_c[0], src_a[1] ]: [ 2, 3 ] -> d1
205 // zip2(b, c): [ 4, 5 ] -> d2
206 //
207 // Continuous store of { d0, d1, d2 } gives the expected order.
208 template <>
209 class Merge3<uint64_t> final : public UnrollTwice {
210 public:
211 using ScalarType = uint64_t;
212 using VecTraits = neon::VecTraits<ScalarType>;
213 using VectorType = typename VecTraits::VectorType;
214
215 720 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
216 ScalarType *dst) {
217 720 uint64x2x3_t dst_vect;
218 720 dst_vect.val[0] = vzip1q_u64(src_a, src_b);
219 720 dst_vect.val[1] = src_c;
220 720 dst_vect.val[1][1] = src_a[1];
221 720 dst_vect.val[2] = vzip2q_u64(src_b, src_c);
222
223 720 VecTraits::store(dst_vect, &dst[0]);
224 720 }
225
226 128 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
227 const ScalarType *src_c, ScalarType *dst) {
228 128 dst[0] = src_a[0];
229 128 dst[1] = src_b[0];
230 128 dst[2] = src_c[0];
231 128 }
232 }; // end of class Merge3<uint64_t>
233
234 #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
235
236 // ----------------------------------------
237 // ----------- Four-way merge -------------
238 // ----------------------------------------
239
240 // Generic 4-way merge implementation.
241 //
242 // Algorithm description
243 //
244 // Elements are identified by their intended final position in the output.
245 // The description is for 32-bit elements, but it works just the same way
246 // for smaller element sizes.
247 //
248 // VECTOR / LANE: 0 1 2 3
249 // src_a: [ 0, 4, 8, 12 ]
250 // src_b: [ 1, 5, 9, 13 ]
251 // src_c: [ 2, 6, 10, 14 ]
252 // src_d: [ 3, 7, 11, 15 ]
253 //
254 // zip1_u32(a, b): [ 0, 1, 4, 5 ] -> w
255 // zip1_u32(c, d): [ 2, 3, 6, 7 ] -> x
256 // zip2_u32(a, b): [ 8, 9, 12, 13 ] -> y
257 // zip2_u32(c, d): [ 10, 11, 14, 15 ] -> z
258 //
259 // zip1_u64(w, x): [ 0, 1, 2, 3 ] -> d0
260 // zip2_u64(w, x): [ 4, 5, 6, 7 ] -> d1
261 // zip1_u64(y, z): [ 8, 9, 10, 11 ] -> d2
262 // zip2_u64(y, z): [ 12, 13, 14, 15 ] -> d3
263 //
264 // Continuous store of { d0, d1, d2, d3 } gives the expected order.
265 template <typename ScalarType>
266 class Merge4 final : public UnrollTwice {
267 public:
268 using VecTraits = neon::VecTraits<ScalarType>;
269 using VectorType = typename VecTraits::VectorType;
270 using Vector4Type = typename VecTraits::Vector4Type;
271
272 4080 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
273 VectorType src_d, ScalarType *dst) {
274 #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
275 Vector4Type dst_vect;
276 dst_vect.val[0] = src_a;
277 dst_vect.val[1] = src_b;
278 dst_vect.val[2] = src_c;
279 dst_vect.val[3] = src_d;
280 vst4q(&dst[0], dst_vect);
281 #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
282 4080 auto zip1_a_b = double_width(vzip1q(src_a, src_b));
283 4080 auto zip1_c_d = double_width(vzip1q(src_c, src_d));
284 4080 auto zip2_a_b = double_width(vzip2q(src_a, src_b));
285 4080 auto zip2_c_d = double_width(vzip2q(src_c, src_d));
286
287 // Compilers tend to replace zip instructions with mov, resulting in
288 // longer generated code. Omitting a bitcast appears to help.
289 using DoubleScalarType = double_element_width_t<ScalarType>;
290 4080 typename neon::VecTraits<DoubleScalarType>::Vector4Type dst_vect;
291 4080 dst_vect.val[0] = vzip1q(zip1_a_b, zip1_c_d);
292 4080 dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d);
293 4080 dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d);
294 4080 dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d);
295 4080 neon::VecTraits<DoubleScalarType>::store(
296 4080 dst_vect, reinterpret_cast<DoubleScalarType *>(&dst[0]));
297
298 #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
299 4080 }
300
301 768 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
302 const ScalarType *src_c, const ScalarType *src_d,
303 ScalarType *dst) {
304 768 dst[0] = src_a[0];
305 768 dst[1] = src_b[0];
306 768 dst[2] = src_c[0];
307 768 dst[3] = src_d[0];
308 768 }
309
310 private:
311 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
312 // Polymorphic reinterpret_cast<>() between vector types where the element
313 // size is doubled. For example, if 'VectorType' is 'uint8x16_t', this
314 // method returns 'reinterpret_cast<uint16x8_t>(vector)'.
315 16320 static double_element_width_t<VectorType> double_width(VectorType vector) {
316 16320 return reinterpret_cast<double_element_width_t<VectorType>>(vector);
317 }
318 #endif
319 }; // end of class Merge4<ScalarType>
320
321 #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
322
323 // Specialized 4-way merge implementation for 64-bit elements.
324 //
325 // Algorithm description
326 //
327 // Elements are identified by their intended final position in the output.
328 //
329 // VECTOR / LANE: 0 1
330 // src_a: [ 0, 4 ]
331 // src_b: [ 1, 5 ]
332 // src_c: [ 2, 6 ]
333 // src_d: [ 3, 7 ]
334 //
335 // zip1(a, b): [ 0, 1 ] -> d0
336 // zip1(c, d): [ 2, 3 ] -> d1
337 // zip2(a, b): [ 4, 5 ] -> d2
338 // zip2(c, d): [ 6, 7 ] -> d3
339 //
340 // Continuous store of { d0, d1, d2, d3 } gives the expected order.
341 template <>
342 class Merge4<uint64_t> final : public UnrollTwice {
343 public:
344 using ScalarType = uint64_t;
345 using VecTraits = neon::VecTraits<ScalarType>;
346 using VectorType = typename VecTraits::VectorType;
347 using Vector4Type = typename VecTraits::Vector4Type;
348
349 1360 void vector_path(VectorType src_a, VectorType src_b, VectorType src_c,
350 VectorType src_d, ScalarType *dst) {
351 1360 Vector4Type dst_vect;
352 1360 dst_vect.val[0] = vzip1q(src_a, src_b);
353 1360 dst_vect.val[1] = vzip1q(src_c, src_d);
354 1360 dst_vect.val[2] = vzip2q(src_a, src_b);
355 1360 dst_vect.val[3] = vzip2q(src_c, src_d);
356 1360 VecTraits::store(dst_vect, &dst[0]);
357 1360 }
358
359 256 void scalar_path(const ScalarType *src_a, const ScalarType *src_b,
360 const ScalarType *src_c, const ScalarType *src_d,
361 ScalarType *dst) {
362 256 dst[0] = src_a[0];
363 256 dst[1] = src_b[0];
364 256 dst[2] = src_c[0];
365 256 dst[3] = src_d[0];
366 256 }
367 }; // end of class Merge4<uint64_t>
368
369 #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE
370
371 // Most of the complexity comes from parameter checking.
372 // NOLINTBEGIN(readability-function-cognitive-complexity)
373 template <typename ScalarType>
374 1520 kleidicv_error_t merge(const void **srcs, const size_t *src_strides,
375 void *dst_void, size_t dst_stride, size_t width,
376 size_t height, size_t channels) {
377
8/8
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 304 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 400 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 400 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 400 times.
1520 if (channels < 2) {
378 16 return KLEIDICV_ERROR_RANGE;
379 }
380
8/8
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 296 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 392 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 392 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 392 times.
1504 CHECK_POINTERS(srcs, src_strides);
381
6/6
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 380 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 380 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 380 times.
1472 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src0, srcs[0]);
382
6/6
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 368 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 368 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 368 times.
1436 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src1, srcs[1]);
383
6/6
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 356 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 356 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 356 times.
1400 MAKE_POINTER_CHECK_ALIGNMENT(ScalarType, dst, dst_void);
384
16/16
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 332 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 332 times.
✓ Branch 8 taken 24 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 332 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 332 times.
✓ Branch 14 taken 24 times.
✓ Branch 15 taken 332 times.
1364 CHECK_POINTER_AND_STRIDE(src0, src_strides[0], height);
385
16/16
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 272 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 308 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 308 times.
✓ Branch 8 taken 24 times.
✓ Branch 9 taken 308 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 308 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 308 times.
✓ Branch 14 taken 24 times.
✓ Branch 15 taken 308 times.
1280 CHECK_POINTER_AND_STRIDE(src1, src_strides[1], height);
386
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 268 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 268 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 292 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 292 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 292 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 292 times.
✓ Branch 12 taken 16 times.
✓ Branch 13 taken 292 times.
✓ Branch 14 taken 16 times.
✓ Branch 15 taken 292 times.
1196 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
387
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 260 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 260 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 288 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 284 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 284 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 284 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 284 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 288 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 284 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 284 times.
1144 CHECK_IMAGE_SIZE(width, height);
388
389 1112 Rectangle rect{width, height};
390 1112 Rows<const ScalarType> src_a_rows{src0, src_strides[0]};
391 1112 Rows<const ScalarType> src_b_rows{src1, src_strides[1]};
392 1112 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
393
394
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 44 times.
✓ Branch 2 taken 72 times.
✓ Branch 3 taken 140 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 44 times.
✓ Branch 6 taken 80 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 44 times.
✓ Branch 10 taken 80 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 44 times.
✓ Branch 14 taken 80 times.
✓ Branch 15 taken 156 times.
1112 switch (channels) {
395 case 2: {
396 176 Merge2<ScalarType> operation;
397 176 apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows,
398 dst_rows);
399 176 } break;
400
401 case 3: {
402
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
312 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]);
403
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 68 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 68 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 68 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 68 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 68 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 68 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 68 times.
300 CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height);
404 272 Merge3<ScalarType> operation;
405 272 Rows<const ScalarType> src_c_rows{src2, src_strides[2]};
406 272 apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows,
407 src_c_rows, dst_rows);
408
8/8
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 68 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 68 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 68 times.
300 } break;
409
410 case 4: {
411
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 152 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 152 times.
608 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]);
412
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 148 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 148 times.
596 MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src3, srcs[3]);
413
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 136 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 136 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 140 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 140 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 140 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 140 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 140 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 140 times.
584 CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height);
414
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 132 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 132 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 132 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 132 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 132 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 132 times.
556 CHECK_POINTER_AND_STRIDE(src3, src_strides[3], height);
415 528 Merge4<ScalarType> operation;
416 528 Rows<const ScalarType> src_c_rows{src2, src_strides[2]};
417 528 Rows<const ScalarType> src_d_rows{src3, src_strides[3]};
418 528 apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows,
419 src_c_rows, src_d_rows, dst_rows);
420
8/8
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 132 times.
✓ Branch 4 taken 20 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 20 times.
✓ Branch 7 taken 132 times.
596 } break;
421
422 default:
423 16 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
424 }
425 976 return KLEIDICV_OK;
426 1520 }
427 // NOLINTEND(readability-function-cognitive-complexity)
428
429 KLEIDICV_TARGET_FN_ATTRS
430 1524 kleidicv_error_t merge(const void **srcs, const size_t *src_strides, void *dst,
431 size_t dst_stride, size_t width, size_t height,
432 size_t channels, size_t element_size) {
433
5/5
✓ Branch 0 taken 404 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 308 times.
✓ Branch 3 taken 404 times.
✓ Branch 4 taken 404 times.
1524 switch (element_size) {
434 case sizeof(uint8_t):
435 616 return merge<uint8_t>(srcs, src_strides, dst, dst_stride, width, height,
436 308 channels);
437
438 case sizeof(uint16_t):
439 808 return merge<uint16_t>(srcs, src_strides, dst, dst_stride, width, height,
440 404 channels);
441
442 case sizeof(uint32_t):
443 808 return merge<uint32_t>(srcs, src_strides, dst, dst_stride, width, height,
444 404 channels);
445
446 case sizeof(uint64_t):
447 808 return merge<uint64_t>(srcs, src_strides, dst, dst_stride, width, height,
448 404 channels);
449
450 default:
451 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
452 }
453 1524 }
454
455 } // namespace kleidicv::neon
456