KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/arithmetics/transpose_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 136 136 100.0%
Functions: 63 63 100.0%
Branches: 184 185 99.5%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/arithmetics/transpose.h"
6 #include "kleidicv/kleidicv.h"
7 #include "kleidicv/neon.h"
8
9 namespace kleidicv::neon {
10
11 template <const size_t BufferSize, const size_t Order, typename DstVectorType,
12 typename SrcType>
13 14360 static void transpose_vectors_recursively(DstVectorType *dst_vectors,
14 Rows<const SrcType> src_rows) {
15 // order is halved at every recursive call, once it is 2 the recursion should
16 // stop and the input data needs to be read.
17 if constexpr (Order == 2) {
18 KLEIDICV_FORCE_LOOP_UNROLL
19
8/8
✓ Branch 0 taken 1436 times.
✓ Branch 1 taken 11488 times.
✓ Branch 2 taken 1436 times.
✓ Branch 3 taken 5744 times.
✓ Branch 4 taken 1436 times.
✓ Branch 5 taken 2872 times.
✓ Branch 6 taken 1436 times.
✓ Branch 7 taken 1436 times.
27284 for (size_t index = 0; index < BufferSize; index += Order) {
20 using SrcVectorType = typename VecTraits<SrcType>::VectorType;
21 21540 SrcVectorType src_vector[2];
22
23 21540 src_vector[0] = vld1q(&src_rows.at(index + 0)[0]);
24 21540 src_vector[1] = vld1q(&src_rows.at(index + 1)[0]);
25
26 // If order is 2 then SrcVectorType is the same as DstVectorType
27 21540 dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]);
28 21540 dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]);
29 21540 }
30 } else {
31 // First we need to create the input for the current transpose stage, which
32 // is the output of the previous stage. The previous stage transposes
33 // elements half the size of the current stage and its order is also half of
34 // the current one.
35 8616 half_element_width_t<DstVectorType> input[BufferSize];
36 8616 constexpr size_t previous_order = Order / 2;
37
38 8616 transpose_vectors_recursively<BufferSize, previous_order>(input, src_rows);
39
40 8616 constexpr size_t half_order = Order / 2;
41
42 KLEIDICV_FORCE_LOOP_UNROLL
43
12/12
✓ Branch 0 taken 1436 times.
✓ Branch 1 taken 1436 times.
✓ Branch 2 taken 1436 times.
✓ Branch 3 taken 2872 times.
✓ Branch 4 taken 1436 times.
✓ Branch 5 taken 5744 times.
✓ Branch 6 taken 1436 times.
✓ Branch 7 taken 1436 times.
✓ Branch 8 taken 1436 times.
✓ Branch 9 taken 2872 times.
✓ Branch 10 taken 1436 times.
✓ Branch 11 taken 1436 times.
24412 for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) {
44 KLEIDICV_FORCE_LOOP_UNROLL
45
12/12
✓ Branch 0 taken 11488 times.
✓ Branch 1 taken 1436 times.
✓ Branch 2 taken 11488 times.
✓ Branch 3 taken 2872 times.
✓ Branch 4 taken 11488 times.
✓ Branch 5 taken 5744 times.
✓ Branch 6 taken 5744 times.
✓ Branch 7 taken 1436 times.
✓ Branch 8 taken 5744 times.
✓ Branch 9 taken 2872 times.
✓ Branch 10 taken 2872 times.
✓ Branch 11 taken 1436 times.
64620 for (size_t inner_i = 0; inner_i < half_order; ++inner_i) {
46 48824 dst_vectors[outer_i + inner_i] =
47 97648 vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
48 48824 reinterpret_cast<DstVectorType>(
49 48824 input[outer_i + inner_i + half_order]));
50
51 48824 dst_vectors[outer_i + half_order + inner_i] =
52 97648 vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
53 48824 reinterpret_cast<DstVectorType>(
54 48824 input[outer_i + inner_i + half_order]));
55 48824 }
56 15796 }
57 8616 }
58 14360 }
59
60 // Transposes one tile of data with vector instructions. The tile's width and
61 // height are the number of NEON lanes for the given type.
62 template <typename ScalarType>
63 5744 static void vector_path(Rows<const ScalarType> src_rows,
64 Rows<ScalarType> dst_rows) {
65 5744 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
66
67 // The number of vectors read and written is the same as the lane count of
68 // the given element size
69 5744 constexpr size_t buffer_size = num_of_lanes;
70
71 // Last transpose step is always done on 64 bit elements
72 5744 uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays)
73
74 // The 64 bit transpose spans through all the vectors, so its "order" is the
75 // same as the number of vectors
76 5744 constexpr size_t transpose_order_b64 = num_of_lanes;
77
78 5744 transpose_vectors_recursively<buffer_size, transpose_order_b64>(
79 5744 trn_result_b64, src_rows);
80
81 KLEIDICV_FORCE_LOOP_UNROLL
82
8/8
✓ Branch 0 taken 1436 times.
✓ Branch 1 taken 22976 times.
✓ Branch 2 taken 1436 times.
✓ Branch 3 taken 11488 times.
✓ Branch 4 taken 1436 times.
✓ Branch 5 taken 5744 times.
✓ Branch 6 taken 1436 times.
✓ Branch 7 taken 2872 times.
48824 for (size_t index = 0; index < buffer_size; ++index) {
83 43080 vst1q(&dst_rows.at(index)[0], trn_result_b64[index]);
84 43080 }
85 5744 }
86
87 template <typename ScalarType>
88 69 static void scalar_path(Rows<const ScalarType> src_rows,
89 Rows<ScalarType> dst_rows, size_t height,
90 size_t width) {
91
8/8
✓ Branch 0 taken 15 times.
✓ Branch 1 taken 191 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 98 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 50 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 26 times.
434 for (size_t vindex = 0; vindex < height; ++vindex) {
92 365 disable_loop_vectorization();
93
8/8
✓ Branch 0 taken 2940 times.
✓ Branch 1 taken 191 times.
✓ Branch 2 taken 703 times.
✓ Branch 3 taken 98 times.
✓ Branch 4 taken 159 times.
✓ Branch 5 taken 50 times.
✓ Branch 6 taken 31 times.
✓ Branch 7 taken 26 times.
4198 for (size_t hindex = 0; hindex < width; ++hindex) {
94 3833 disable_loop_vectorization();
95 3833 dst_rows.at(hindex)[vindex] = src_rows.at(vindex)[hindex];
96 3833 }
97 365 }
98 69 }
99
100 template <typename ScalarType>
101 153 static kleidicv_error_t transpose(Rectangle rect,
102 Rows<const ScalarType> src_rows,
103 Rows<ScalarType> dst_rows) {
104 153 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
105
106 713 auto handle_lane_number_of_rows = [&](size_t vindex) {
107 560 LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes);
108
109 4624 horizontal_loop.unroll_once([&](size_t hindex) {
110 // if the input is big enough handle it tile by tile
111 8128 vector_path<ScalarType>(src_rows.at(vindex, hindex),
112 4064 dst_rows.at(hindex, vindex));
113 4064 });
114
115 592 horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) {
116 64 scalar_path(src_rows.at(vindex, hindex), dst_rows.at(hindex, vindex),
117 32 num_of_lanes, final_hindex - hindex);
118 32 });
119 560 };
120
121 153 LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes);
122
123 153 vertical_loop.unroll_once(handle_lane_number_of_rows);
124
125 190 vertical_loop.remaining([&](size_t hindex, size_t final_hindex) {
126 74 scalar_path(src_rows.at(hindex), dst_rows.at(0, hindex),
127 37 final_hindex - hindex, rect.width());
128 37 });
129 153 return KLEIDICV_OK;
130 153 }
131
132 template <typename ScalarType>
133 72 static kleidicv_error_t transpose(Rectangle rect, Rows<ScalarType> data_rows) {
134 72 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
135
136 // rect.width() needs to be equal to rect.height()
137 72 LoopUnroll2 outer_loop(rect.width(), num_of_lanes);
138
139 312 outer_loop.unroll_once([&](size_t vindex) {
140 // Handle tiles on the diagonal line
141 480 vector_path<ScalarType>(data_rows.at(vindex, vindex),
142 240 data_rows.at(vindex, vindex));
143
144 // Handle the top right half
145
8/8
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 54 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 54 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 54 times.
240 if (rect.width() > (vindex + num_of_lanes)) {
146 // Indexes are running through only the top right half
147 216 LoopUnroll2 inner_loop(vindex + num_of_lanes, rect.width(), num_of_lanes);
148
149 936 inner_loop.unroll_once([&](size_t hindex) {
150 // Allocate temporary memory for one tile
151 720 ScalarType tmp[num_of_lanes * num_of_lanes]; // NOLINT(runtime/arrays)
152 720 Rows<ScalarType> tmp_rows{tmp, num_of_lanes * sizeof(ScalarType)};
153
154 // Transpose a tile from the top right area, save the result
155 // into temporary memory
156 720 vector_path<ScalarType>(data_rows.at(vindex, hindex), tmp_rows);
157 // Transpose its mirror tile from the left bottom area, save the
158 // result to its final space
159 1440 vector_path<ScalarType>(data_rows.at(hindex, vindex),
160 720 data_rows.at(vindex, hindex));
161 // Copy the temporary result to its final destination
162 1440 Rows<const ScalarType> const_tmp_rows{
163 720 tmp, num_of_lanes * sizeof(ScalarType)};
164 720 CopyNonOverlappingRows<ScalarType>::copy_rows(
165 720 Rectangle{num_of_lanes, num_of_lanes}, const_tmp_rows,
166 720 data_rows.at(hindex, vindex));
167 720 });
168
169 360 inner_loop.remaining([&](size_t hindex, size_t final_hindex) {
170 // As this is the unroll_once path of the outer_loop there is
171 // num_of_lanes worth of data in the vertical direction
172
8/8
✓ Branch 0 taken 36 times.
✓ Branch 1 taken 576 times.
✓ Branch 2 taken 36 times.
✓ Branch 3 taken 288 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 144 times.
✓ Branch 6 taken 36 times.
✓ Branch 7 taken 72 times.
1224 for (size_t i = vindex; i < vindex + num_of_lanes; ++i) {
173 1080 disable_loop_vectorization();
174
8/8
✓ Branch 0 taken 8640 times.
✓ Branch 1 taken 576 times.
✓ Branch 2 taken 2016 times.
✓ Branch 3 taken 288 times.
✓ Branch 4 taken 432 times.
✓ Branch 5 taken 144 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 72 times.
12240 for (size_t j = hindex; j < final_hindex; ++j) {
175 11160 disable_loop_vectorization();
176 11160 std::swap(data_rows.at(i)[j], data_rows.at(j)[i]);
177 11160 }
178 1080 }
179 144 });
180 216 }
181 240 });
182
183 120 outer_loop.remaining([&](size_t vindex, size_t final_vindex) {
184
8/8
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 36 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 12 times.
360 for (size_t i = vindex; i < final_vindex; ++i) {
185 312 disable_loop_vectorization();
186 // Only the top right half pixels need to be indexed
187
7/8
✓ Branch 0 taken 1260 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 252 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 36 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 12 times.
1860 for (size_t j = i + 1; j < final_vindex; ++j) {
188 1548 disable_loop_vectorization();
189 1548 std::swap(data_rows.at(i)[j], data_rows.at(j)[i]);
190 1548 }
191 312 }
192 48 });
193 72 return KLEIDICV_OK;
194 72 }
195
196 template <typename T>
197 321 static kleidicv_error_t transpose(const void *src_void, size_t src_stride,
198 void *dst_void, size_t dst_stride,
199 size_t src_width, size_t src_height) {
200
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 81 times.
321 MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void);
201
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 78 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 78 times.
312 MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void);
202
16/16
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 66 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 72 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 72 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 72 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 72 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 72 times.
303 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
203
16/16
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 63 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 63 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 66 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 66 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 66 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 66 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 66 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 66 times.
282 CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width);
204
24/24
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 60 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 57 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 57 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 63 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 60 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 60 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 63 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 60 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 60 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 63 times.
✓ Branch 20 taken 3 times.
✓ Branch 21 taken 60 times.
✓ Branch 22 taken 6 times.
✓ Branch 23 taken 60 times.
261 CHECK_IMAGE_SIZE(src_width, src_height);
205
206 237 Rectangle rect{src_width, src_height};
207 237 Rows<T> dst_rows{dst, dst_stride};
208
209
8/8
✓ Branch 0 taken 36 times.
✓ Branch 1 taken 21 times.
✓ Branch 2 taken 39 times.
✓ Branch 3 taken 21 times.
✓ Branch 4 taken 39 times.
✓ Branch 5 taken 21 times.
✓ Branch 6 taken 39 times.
✓ Branch 7 taken 21 times.
237 if (src == dst) {
210
8/8
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 18 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 18 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 18 times.
84 if (src_width != src_height) {
211 // Inplace transpose only implemented if width and height are the same
212 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
213 }
214 72 return transpose(rect, dst_rows);
215 }
216 153 Rows<const T> src_rows{src, src_stride};
217 153 return transpose(rect, src_rows, dst_rows);
218 321 }
219
220 KLEIDICV_TARGET_FN_ATTRS
221 324 kleidicv_error_t transpose(const void *src, size_t src_stride, void *dst,
222 size_t dst_stride, size_t src_width,
223 size_t src_height, size_t element_size) {
224
5/5
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 69 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.
324 switch (element_size) {
225 case sizeof(uint8_t):
226 138 return transpose<uint8_t>(src, src_stride, dst, dst_stride, src_width,
227 69 src_height);
228 case sizeof(uint16_t):
229 168 return transpose<uint16_t>(src, src_stride, dst, dst_stride, src_width,
230 84 src_height);
231 case sizeof(uint32_t):
232 168 return transpose<uint32_t>(src, src_stride, dst, dst_stride, src_width,
233 84 src_height);
234 case sizeof(uint64_t):
235 168 return transpose<uint64_t>(src, src_stride, dst, dst_stride, src_width,
236 84 src_height);
237 default:
238 3 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
239 }
240 324 }
241
242 } // namespace kleidicv::neon
243