KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/arithmetics/transpose_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 136 136 100.0%
Functions: 63 63 100.0%
Branches: 184 185 99.5%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/arithmetics/transpose.h"
6 #include "kleidicv/kleidicv.h"
7 #include "kleidicv/neon.h"
8
9 namespace kleidicv::neon {
10
11 template <const size_t BufferSize, const size_t Order, typename DstVectorType,
12 typename SrcType>
13 27200 static void transpose_vectors_recursively(DstVectorType *dst_vectors,
14 Rows<const SrcType> src_rows) {
15 // order is halved at every recursive call, once it is 2 the recursion should
16 // stop and the input data needs to be read.
17 if constexpr (Order == 2) {
18 KLEIDICV_FORCE_LOOP_UNROLL
19
8/8
✓ Branch 0 taken 2720 times.
✓ Branch 1 taken 21760 times.
✓ Branch 2 taken 2720 times.
✓ Branch 3 taken 10880 times.
✓ Branch 4 taken 2720 times.
✓ Branch 5 taken 5440 times.
✓ Branch 6 taken 2720 times.
✓ Branch 7 taken 2720 times.
51680 for (size_t index = 0; index < BufferSize; index += Order) {
20 using SrcVectorType = typename VecTraits<SrcType>::VectorType;
21 40800 SrcVectorType src_vector[2];
22
23 40800 src_vector[0] = vld1q(&src_rows.at(index + 0)[0]);
24 40800 src_vector[1] = vld1q(&src_rows.at(index + 1)[0]);
25
26 // If order is 2 then SrcVectorType is the same as DstVectorType
27 40800 dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]);
28 40800 dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]);
29 40800 }
30 } else {
31 // First we need to create the input for the current transpose stage, which
32 // is the output of the previous stage. The previous stage transposes
33 // elements half the size of the current stage and its order is also half of
34 // the current one.
35 16320 half_element_width_t<DstVectorType> input[BufferSize];
36 16320 constexpr size_t previous_order = Order / 2;
37
38 16320 transpose_vectors_recursively<BufferSize, previous_order>(input, src_rows);
39
40 16320 constexpr size_t half_order = Order / 2;
41
42 KLEIDICV_FORCE_LOOP_UNROLL
43
12/12
✓ Branch 0 taken 2720 times.
✓ Branch 1 taken 2720 times.
✓ Branch 2 taken 2720 times.
✓ Branch 3 taken 5440 times.
✓ Branch 4 taken 2720 times.
✓ Branch 5 taken 10880 times.
✓ Branch 6 taken 2720 times.
✓ Branch 7 taken 2720 times.
✓ Branch 8 taken 2720 times.
✓ Branch 9 taken 5440 times.
✓ Branch 10 taken 2720 times.
✓ Branch 11 taken 2720 times.
46240 for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) {
44 KLEIDICV_FORCE_LOOP_UNROLL
45
12/12
✓ Branch 0 taken 21760 times.
✓ Branch 1 taken 2720 times.
✓ Branch 2 taken 21760 times.
✓ Branch 3 taken 5440 times.
✓ Branch 4 taken 21760 times.
✓ Branch 5 taken 10880 times.
✓ Branch 6 taken 10880 times.
✓ Branch 7 taken 2720 times.
✓ Branch 8 taken 10880 times.
✓ Branch 9 taken 5440 times.
✓ Branch 10 taken 5440 times.
✓ Branch 11 taken 2720 times.
122400 for (size_t inner_i = 0; inner_i < half_order; ++inner_i) {
46 92480 dst_vectors[outer_i + inner_i] =
47 184960 vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
48 92480 reinterpret_cast<DstVectorType>(
49 92480 input[outer_i + inner_i + half_order]));
50
51 92480 dst_vectors[outer_i + half_order + inner_i] =
52 184960 vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
53 92480 reinterpret_cast<DstVectorType>(
54 92480 input[outer_i + inner_i + half_order]));
55 92480 }
56 29920 }
57 16320 }
58 27200 }
59
60 // Transposes one tile of data with vector instructions. The tile's width and
61 // height are the number of NEON lanes for the given type.
62 template <typename ScalarType>
63 10880 static void vector_path(Rows<const ScalarType> src_rows,
64 Rows<ScalarType> dst_rows) {
65 10880 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
66
67 // The number of vectors read and written is the same as the lane count of the
68 // given element size
69 10880 constexpr size_t buffer_size = num_of_lanes;
70
71 // Last transpose step is always done on 64 bit elements
72 10880 uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays)
73
74 // The 64 bit transpose spans through all the vectors, so its "order" is the
75 // same as the number of vectors
76 10880 constexpr size_t transpose_order_b64 = num_of_lanes;
77
78 10880 transpose_vectors_recursively<buffer_size, transpose_order_b64>(
79 10880 trn_result_b64, src_rows);
80
81 KLEIDICV_FORCE_LOOP_UNROLL
82
8/8
✓ Branch 0 taken 2720 times.
✓ Branch 1 taken 43520 times.
✓ Branch 2 taken 2720 times.
✓ Branch 3 taken 21760 times.
✓ Branch 4 taken 2720 times.
✓ Branch 5 taken 10880 times.
✓ Branch 6 taken 2720 times.
✓ Branch 7 taken 5440 times.
92480 for (size_t index = 0; index < buffer_size; ++index) {
83 81600 vst1q(&dst_rows.at(index)[0], trn_result_b64[index]);
84 81600 }
85 10880 }
86
87 template <typename ScalarType>
88 76 static void scalar_path(Rows<const ScalarType> src_rows,
89 Rows<ScalarType> dst_rows, size_t height,
90 size_t width) {
91
8/8
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 100 times.
✓ Branch 4 taken 20 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 20 times.
✓ Branch 7 taken 28 times.
448 for (size_t vindex = 0; vindex < height; ++vindex) {
92 372 disable_loop_vectorization();
93
8/8
✓ Branch 0 taken 2940 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 704 times.
✓ Branch 3 taken 100 times.
✓ Branch 4 taken 160 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 32 times.
✓ Branch 7 taken 28 times.
4208 for (size_t hindex = 0; hindex < width; ++hindex) {
94 3836 disable_loop_vectorization();
95 3836 dst_rows.at(hindex)[vindex] = src_rows.at(vindex)[hindex];
96 3836 }
97 372 }
98 76 }
99
100 template <typename ScalarType>
101 204 static kleidicv_error_t transpose(Rectangle rect,
102 Rows<const ScalarType> src_rows,
103 Rows<ScalarType> dst_rows) {
104 204 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
105
106 1132 auto handle_lane_number_of_rows = [&](size_t vindex) {
107 928 LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes);
108
109 8576 horizontal_loop.unroll_once([&](size_t hindex) {
110 // if the input is big enough handle it tile by tile
111 15296 vector_path<ScalarType>(src_rows.at(vindex, hindex),
112 7648 dst_rows.at(hindex, vindex));
113 7648 });
114
115 960 horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) {
116 64 scalar_path(src_rows.at(vindex, hindex), dst_rows.at(hindex, vindex),
117 32 num_of_lanes, final_hindex - hindex);
118 32 });
119 928 };
120
121 204 LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes);
122
123 204 vertical_loop.unroll_once(handle_lane_number_of_rows);
124
125 248 vertical_loop.remaining([&](size_t hindex, size_t final_hindex) {
126 88 scalar_path(src_rows.at(hindex), dst_rows.at(0, hindex),
127 44 final_hindex - hindex, rect.width());
128 44 });
129 204 return KLEIDICV_OK;
130 204 }
131
132 template <typename ScalarType>
133 96 static kleidicv_error_t transpose(Rectangle rect, Rows<ScalarType> data_rows) {
134 96 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
135
136 // rect.width() needs to be equal to rect.height()
137 96 LoopUnroll2 outer_loop(rect.width(), num_of_lanes);
138
139 512 outer_loop.unroll_once([&](size_t vindex) {
140 // Handle tiles on the diagonal line
141 832 vector_path<ScalarType>(data_rows.at(vindex, vindex),
142 416 data_rows.at(vindex, vindex));
143
144 // Handle the top right half
145
8/8
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 96 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 96 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 96 times.
416 if (rect.width() > (vindex + num_of_lanes)) {
146 // Indexes are running through only the top right half
147 384 LoopUnroll2 inner_loop(vindex + num_of_lanes, rect.width(), num_of_lanes);
148
149 1792 inner_loop.unroll_once([&](size_t hindex) {
150 // Allocate temporary memory for one tile
151 1408 ScalarType tmp[num_of_lanes * num_of_lanes]; // NOLINT(runtime/arrays)
152 1408 Rows<ScalarType> tmp_rows{tmp, num_of_lanes * sizeof(ScalarType)};
153
154 // Transpose a tile from the top right area, save the result
155 // into temporary memory
156 1408 vector_path<ScalarType>(data_rows.at(vindex, hindex), tmp_rows);
157 // Transpose its mirror tile from the left bottom area, save the
158 // result to its final space
159 2816 vector_path<ScalarType>(data_rows.at(hindex, vindex),
160 1408 data_rows.at(vindex, hindex));
161 // Copy the temporary result to its final destination
162 2816 Rows<const ScalarType> const_tmp_rows{
163 1408 tmp, num_of_lanes * sizeof(ScalarType)};
164 1408 CopyNonOverlappingRows<ScalarType>::copy_rows(
165 1408 Rectangle{num_of_lanes, num_of_lanes}, const_tmp_rows,
166 1408 data_rows.at(hindex, vindex));
167 1408 });
168
169 640 inner_loop.remaining([&](size_t hindex, size_t final_hindex) {
170 // As this is the unroll_once path of the outer_loop there is
171 // num_of_lanes worth of data in the vertical direction
172
8/8
✓ Branch 0 taken 64 times.
✓ Branch 1 taken 1024 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 512 times.
✓ Branch 4 taken 64 times.
✓ Branch 5 taken 256 times.
✓ Branch 6 taken 64 times.
✓ Branch 7 taken 128 times.
2176 for (size_t i = vindex; i < vindex + num_of_lanes; ++i) {
173 1920 disable_loop_vectorization();
174
8/8
✓ Branch 0 taken 15360 times.
✓ Branch 1 taken 1024 times.
✓ Branch 2 taken 3584 times.
✓ Branch 3 taken 512 times.
✓ Branch 4 taken 768 times.
✓ Branch 5 taken 256 times.
✓ Branch 6 taken 128 times.
✓ Branch 7 taken 128 times.
21760 for (size_t j = hindex; j < final_hindex; ++j) {
175 19840 disable_loop_vectorization();
176 19840 std::swap(data_rows.at(i)[j], data_rows.at(j)[i]);
177 19840 }
178 1920 }
179 256 });
180 384 }
181 416 });
182
183 160 outer_loop.remaining([&](size_t vindex, size_t final_vindex) {
184
8/8
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 240 times.
✓ Branch 2 taken 16 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 16 times.
480 for (size_t i = vindex; i < final_vindex; ++i) {
185 416 disable_loop_vectorization();
186 // Only the top right half pixels need to be indexed
187
7/8
✓ Branch 0 taken 1680 times.
✓ Branch 1 taken 240 times.
✓ Branch 2 taken 336 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 48 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 16 times.
2480 for (size_t j = i + 1; j < final_vindex; ++j) {
188 2064 disable_loop_vectorization();
189 2064 std::swap(data_rows.at(i)[j], data_rows.at(j)[i]);
190 2064 }
191 416 }
192 64 });
193 96 return KLEIDICV_OK;
194 96 }
195
196 template <typename T>
197 428 static kleidicv_error_t transpose(const void *src_void, size_t src_stride,
198 void *dst_void, size_t dst_stride,
199 size_t src_width, size_t src_height) {
200
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 108 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 108 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 108 times.
428 MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void);
201
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 104 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 104 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 104 times.
416 MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void);
202
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 88 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 96 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 96 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 96 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 96 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 96 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 96 times.
404 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
203
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 84 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 88 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 88 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 88 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 88 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 88 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 88 times.
376 CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width);
204
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 76 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 84 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 80 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 80 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 84 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 80 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 80 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 84 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 80 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 80 times.
348 CHECK_IMAGE_SIZE(src_width, src_height);
205
206 316 Rectangle rect{src_width, src_height};
207 316 Rows<T> dst_rows{dst, dst_stride};
208
209
8/8
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 28 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 28 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 28 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 28 times.
316 if (src == dst) {
210
8/8
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 24 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 24 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 24 times.
112 if (src_width != src_height) {
211 // Inplace transpose only implemented if width and height are the same
212 16 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
213 }
214 96 return transpose(rect, dst_rows);
215 }
216 204 Rows<const T> src_rows{src, src_stride};
217 204 return transpose(rect, src_rows, dst_rows);
218 428 }
219
220 KLEIDICV_TARGET_FN_ATTRS
221 432 kleidicv_error_t transpose(const void *src, size_t src_stride, void *dst,
222 size_t dst_stride, size_t src_width,
223 size_t src_height, size_t element_size) {
224
5/5
✓ Branch 0 taken 112 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 92 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 112 times.
432 switch (element_size) {
225 case sizeof(uint8_t):
226 184 return transpose<uint8_t>(src, src_stride, dst, dst_stride, src_width,
227 92 src_height);
228 case sizeof(uint16_t):
229 224 return transpose<uint16_t>(src, src_stride, dst, dst_stride, src_width,
230 112 src_height);
231 case sizeof(uint32_t):
232 224 return transpose<uint32_t>(src, src_stride, dst, dst_stride, src_width,
233 112 src_height);
234 case sizeof(uint64_t):
235 224 return transpose<uint64_t>(src, src_stride, dst, dst_stride, src_width,
236 112 src_height);
237 default:
238 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
239 }
240 432 }
241
242 } // namespace kleidicv::neon
243