Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv/arithmetics/transpose.h" | ||
6 | #include "kleidicv/kleidicv.h" | ||
7 | #include "kleidicv/neon.h" | ||
8 | |||
9 | namespace kleidicv::neon { | ||
10 | |||
11 | template <const size_t BufferSize, const size_t Order, typename DstVectorType, | ||
12 | typename SrcType> | ||
13 | 14360 | static void transpose_vectors_recursively(DstVectorType *dst_vectors, | |
14 | Rows<const SrcType> src_rows) { | ||
15 | // order is halved at every recursive call, once it is 2 the recursion should | ||
16 | // stop and the input data needs to be read. | ||
17 | if constexpr (Order == 2) { | ||
18 | KLEIDICV_FORCE_LOOP_UNROLL | ||
19 |
8/8✓ Branch 0 taken 1436 times.
✓ Branch 1 taken 11488 times.
✓ Branch 2 taken 1436 times.
✓ Branch 3 taken 5744 times.
✓ Branch 4 taken 1436 times.
✓ Branch 5 taken 2872 times.
✓ Branch 6 taken 1436 times.
✓ Branch 7 taken 1436 times.
|
27284 | for (size_t index = 0; index < BufferSize; index += Order) { |
20 | using SrcVectorType = typename VecTraits<SrcType>::VectorType; | ||
21 | 21540 | SrcVectorType src_vector[2]; | |
22 | |||
23 | 21540 | src_vector[0] = vld1q(&src_rows.at(index + 0)[0]); | |
24 | 21540 | src_vector[1] = vld1q(&src_rows.at(index + 1)[0]); | |
25 | |||
26 | // If order is 2 then SrcVectorType is the same as DstVectorType | ||
27 | 21540 | dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]); | |
28 | 21540 | dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]); | |
29 | 21540 | } | |
30 | } else { | ||
31 | // First we need to create the input for the current transpose stage, which | ||
32 | // is the output of the previous stage. The previous stage transposes | ||
33 | // elements half the size of the current stage and its order is also half of | ||
34 | // the current one. | ||
35 | 8616 | half_element_width_t<DstVectorType> input[BufferSize]; | |
36 | 8616 | constexpr size_t previous_order = Order / 2; | |
37 | |||
38 | 8616 | transpose_vectors_recursively<BufferSize, previous_order>(input, src_rows); | |
39 | |||
40 | 8616 | constexpr size_t half_order = Order / 2; | |
41 | |||
42 | KLEIDICV_FORCE_LOOP_UNROLL | ||
43 |
12/12✓ Branch 0 taken 1436 times.
✓ Branch 1 taken 1436 times.
✓ Branch 2 taken 1436 times.
✓ Branch 3 taken 2872 times.
✓ Branch 4 taken 1436 times.
✓ Branch 5 taken 5744 times.
✓ Branch 6 taken 1436 times.
✓ Branch 7 taken 1436 times.
✓ Branch 8 taken 1436 times.
✓ Branch 9 taken 2872 times.
✓ Branch 10 taken 1436 times.
✓ Branch 11 taken 1436 times.
|
24412 | for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) { |
44 | KLEIDICV_FORCE_LOOP_UNROLL | ||
45 |
12/12✓ Branch 0 taken 11488 times.
✓ Branch 1 taken 1436 times.
✓ Branch 2 taken 11488 times.
✓ Branch 3 taken 2872 times.
✓ Branch 4 taken 11488 times.
✓ Branch 5 taken 5744 times.
✓ Branch 6 taken 5744 times.
✓ Branch 7 taken 1436 times.
✓ Branch 8 taken 5744 times.
✓ Branch 9 taken 2872 times.
✓ Branch 10 taken 2872 times.
✓ Branch 11 taken 1436 times.
|
64620 | for (size_t inner_i = 0; inner_i < half_order; ++inner_i) { |
46 | 48824 | dst_vectors[outer_i + inner_i] = | |
47 | 97648 | vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
48 | 48824 | reinterpret_cast<DstVectorType>( | |
49 | 48824 | input[outer_i + inner_i + half_order])); | |
50 | |||
51 | 48824 | dst_vectors[outer_i + half_order + inner_i] = | |
52 | 97648 | vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
53 | 48824 | reinterpret_cast<DstVectorType>( | |
54 | 48824 | input[outer_i + inner_i + half_order])); | |
55 | 48824 | } | |
56 | 15796 | } | |
57 | 8616 | } | |
58 | 14360 | } | |
59 | |||
60 | // Transposes one tile of data with vector instructions. The tile's width and | ||
61 | // height are the number of NEON lanes for the given type. | ||
62 | template <typename ScalarType> | ||
63 | 5744 | static void vector_path(Rows<const ScalarType> src_rows, | |
64 | Rows<ScalarType> dst_rows) { | ||
65 | 5744 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
66 | |||
67 | // The number of vectors read and written is the same as the lane count of the | ||
68 | // given element size | ||
69 | 5744 | constexpr size_t buffer_size = num_of_lanes; | |
70 | |||
71 | // Last transpose step is always done on 64 bit elements | ||
72 | 5744 | uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays) | |
73 | |||
74 | // The 64 bit transpose spans through all the vectors, so its "order" is the | ||
75 | // same as the number of vectors | ||
76 | 5744 | constexpr size_t transpose_order_b64 = num_of_lanes; | |
77 | |||
78 | 5744 | transpose_vectors_recursively<buffer_size, transpose_order_b64>( | |
79 | 5744 | trn_result_b64, src_rows); | |
80 | |||
81 | KLEIDICV_FORCE_LOOP_UNROLL | ||
82 |
8/8✓ Branch 0 taken 1436 times.
✓ Branch 1 taken 22976 times.
✓ Branch 2 taken 1436 times.
✓ Branch 3 taken 11488 times.
✓ Branch 4 taken 1436 times.
✓ Branch 5 taken 5744 times.
✓ Branch 6 taken 1436 times.
✓ Branch 7 taken 2872 times.
|
48824 | for (size_t index = 0; index < buffer_size; ++index) { |
83 | 43080 | vst1q(&dst_rows.at(index)[0], trn_result_b64[index]); | |
84 | 43080 | } | |
85 | 5744 | } | |
86 | |||
87 | template <typename ScalarType> | ||
88 | 69 | static void scalar_path(Rows<const ScalarType> src_rows, | |
89 | Rows<ScalarType> dst_rows, size_t height, | ||
90 | size_t width) { | ||
91 |
8/8✓ Branch 0 taken 15 times.
✓ Branch 1 taken 191 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 98 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 50 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 26 times.
|
434 | for (size_t vindex = 0; vindex < height; ++vindex) { |
92 | 365 | disable_loop_vectorization(); | |
93 |
8/8✓ Branch 0 taken 2940 times.
✓ Branch 1 taken 191 times.
✓ Branch 2 taken 703 times.
✓ Branch 3 taken 98 times.
✓ Branch 4 taken 159 times.
✓ Branch 5 taken 50 times.
✓ Branch 6 taken 31 times.
✓ Branch 7 taken 26 times.
|
4198 | for (size_t hindex = 0; hindex < width; ++hindex) { |
94 | 3833 | disable_loop_vectorization(); | |
95 | 3833 | dst_rows.at(hindex)[vindex] = src_rows.at(vindex)[hindex]; | |
96 | 3833 | } | |
97 | 365 | } | |
98 | 69 | } | |
99 | |||
100 | template <typename ScalarType> | ||
101 | 153 | static kleidicv_error_t transpose(Rectangle rect, | |
102 | Rows<const ScalarType> src_rows, | ||
103 | Rows<ScalarType> dst_rows) { | ||
104 | 153 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
105 | |||
106 | 713 | auto handle_lane_number_of_rows = [&](size_t vindex) { | |
107 | 560 | LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes); | |
108 | |||
109 | 4624 | horizontal_loop.unroll_once([&](size_t hindex) { | |
110 | // if the input is big enough handle it tile by tile | ||
111 | 8128 | vector_path<ScalarType>(src_rows.at(vindex, hindex), | |
112 | 4064 | dst_rows.at(hindex, vindex)); | |
113 | 4064 | }); | |
114 | |||
115 | 592 | horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
116 | 64 | scalar_path(src_rows.at(vindex, hindex), dst_rows.at(hindex, vindex), | |
117 | 32 | num_of_lanes, final_hindex - hindex); | |
118 | 32 | }); | |
119 | 560 | }; | |
120 | |||
121 | 153 | LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes); | |
122 | |||
123 | 153 | vertical_loop.unroll_once(handle_lane_number_of_rows); | |
124 | |||
125 | 190 | vertical_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
126 | 74 | scalar_path(src_rows.at(hindex), dst_rows.at(0, hindex), | |
127 | 37 | final_hindex - hindex, rect.width()); | |
128 | 37 | }); | |
129 | 153 | return KLEIDICV_OK; | |
130 | 153 | } | |
131 | |||
132 | template <typename ScalarType> | ||
133 | 72 | static kleidicv_error_t transpose(Rectangle rect, Rows<ScalarType> data_rows) { | |
134 | 72 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
135 | |||
136 | // rect.width() needs to be equal to rect.height() | ||
137 | 72 | LoopUnroll2 outer_loop(rect.width(), num_of_lanes); | |
138 | |||
139 | 312 | outer_loop.unroll_once([&](size_t vindex) { | |
140 | // Handle tiles on the diagonal line | ||
141 | 480 | vector_path<ScalarType>(data_rows.at(vindex, vindex), | |
142 | 240 | data_rows.at(vindex, vindex)); | |
143 | |||
144 | // Handle the top right half | ||
145 |
8/8✓ Branch 0 taken 6 times.
✓ Branch 1 taken 54 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 54 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 54 times.
|
240 | if (rect.width() > (vindex + num_of_lanes)) { |
146 | // Indexes are running through only the top right half | ||
147 | 216 | LoopUnroll2 inner_loop(vindex + num_of_lanes, rect.width(), num_of_lanes); | |
148 | |||
149 | 936 | inner_loop.unroll_once([&](size_t hindex) { | |
150 | // Allocate temporary memory for one tile | ||
151 | 720 | ScalarType tmp[num_of_lanes * num_of_lanes]; // NOLINT(runtime/arrays) | |
152 | 720 | Rows<ScalarType> tmp_rows{tmp, num_of_lanes * sizeof(ScalarType)}; | |
153 | |||
154 | // Transpose a tile from the top right area, save the result | ||
155 | // into temporary memory | ||
156 | 720 | vector_path<ScalarType>(data_rows.at(vindex, hindex), tmp_rows); | |
157 | // Transpose its mirror tile from the left bottom area, save the | ||
158 | // result to its final space | ||
159 | 1440 | vector_path<ScalarType>(data_rows.at(hindex, vindex), | |
160 | 720 | data_rows.at(vindex, hindex)); | |
161 | // Copy the temporary result to its final destination | ||
162 | 1440 | Rows<const ScalarType> const_tmp_rows{ | |
163 | 720 | tmp, num_of_lanes * sizeof(ScalarType)}; | |
164 | 720 | CopyNonOverlappingRows<ScalarType>::copy_rows( | |
165 | 720 | Rectangle{num_of_lanes, num_of_lanes}, const_tmp_rows, | |
166 | 720 | data_rows.at(hindex, vindex)); | |
167 | 720 | }); | |
168 | |||
169 | 360 | inner_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
170 | // As this is the unroll_once path of the outer_loop there is | ||
171 | // num_of_lanes worth of data in the vertical direction | ||
172 |
8/8✓ Branch 0 taken 36 times.
✓ Branch 1 taken 576 times.
✓ Branch 2 taken 36 times.
✓ Branch 3 taken 288 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 144 times.
✓ Branch 6 taken 36 times.
✓ Branch 7 taken 72 times.
|
1224 | for (size_t i = vindex; i < vindex + num_of_lanes; ++i) { |
173 | 1080 | disable_loop_vectorization(); | |
174 |
8/8✓ Branch 0 taken 8640 times.
✓ Branch 1 taken 576 times.
✓ Branch 2 taken 2016 times.
✓ Branch 3 taken 288 times.
✓ Branch 4 taken 432 times.
✓ Branch 5 taken 144 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 72 times.
|
12240 | for (size_t j = hindex; j < final_hindex; ++j) { |
175 | 11160 | disable_loop_vectorization(); | |
176 | 11160 | std::swap(data_rows.at(i)[j], data_rows.at(j)[i]); | |
177 | 11160 | } | |
178 | 1080 | } | |
179 | 144 | }); | |
180 | 216 | } | |
181 | 240 | }); | |
182 | |||
183 | 120 | outer_loop.remaining([&](size_t vindex, size_t final_vindex) { | |
184 |
8/8✓ Branch 0 taken 12 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 36 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 12 times.
|
360 | for (size_t i = vindex; i < final_vindex; ++i) { |
185 | 312 | disable_loop_vectorization(); | |
186 | // Only the top right half pixels need to be indexed | ||
187 |
7/8✓ Branch 0 taken 1260 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 252 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 36 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 12 times.
|
1860 | for (size_t j = i + 1; j < final_vindex; ++j) { |
188 | 1548 | disable_loop_vectorization(); | |
189 | 1548 | std::swap(data_rows.at(i)[j], data_rows.at(j)[i]); | |
190 | 1548 | } | |
191 | 312 | } | |
192 | 48 | }); | |
193 | 72 | return KLEIDICV_OK; | |
194 | 72 | } | |
195 | |||
196 | template <typename T> | ||
197 | 321 | static kleidicv_error_t transpose(const void *src_void, size_t src_stride, | |
198 | void *dst_void, size_t dst_stride, | ||
199 | size_t src_width, size_t src_height) { | ||
200 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 81 times.
|
321 | MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void); |
201 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 78 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 78 times.
|
312 | MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void); |
202 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 66 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 72 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 72 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 72 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 72 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 72 times.
|
303 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
203 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 63 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 63 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 66 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 66 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 66 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 66 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 66 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 66 times.
|
282 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width); |
204 |
24/24✓ Branch 0 taken 3 times.
✓ Branch 1 taken 60 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 57 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 57 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 63 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 60 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 60 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 63 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 60 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 60 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 63 times.
✓ Branch 20 taken 3 times.
✓ Branch 21 taken 60 times.
✓ Branch 22 taken 6 times.
✓ Branch 23 taken 60 times.
|
261 | CHECK_IMAGE_SIZE(src_width, src_height); |
205 | |||
206 | 237 | Rectangle rect{src_width, src_height}; | |
207 | 237 | Rows<T> dst_rows{dst, dst_stride}; | |
208 | |||
209 |
8/8✓ Branch 0 taken 36 times.
✓ Branch 1 taken 21 times.
✓ Branch 2 taken 39 times.
✓ Branch 3 taken 21 times.
✓ Branch 4 taken 39 times.
✓ Branch 5 taken 21 times.
✓ Branch 6 taken 39 times.
✓ Branch 7 taken 21 times.
|
237 | if (src == dst) { |
210 |
8/8✓ Branch 0 taken 3 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 18 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 18 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 18 times.
|
84 | if (src_width != src_height) { |
211 | // Inplace transpose only implemented if width and height are the same | ||
212 | 12 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
213 | } | ||
214 | 72 | return transpose(rect, dst_rows); | |
215 | } | ||
216 | 153 | Rows<const T> src_rows{src, src_stride}; | |
217 | 153 | return transpose(rect, src_rows, dst_rows); | |
218 | 321 | } | |
219 | |||
220 | KLEIDICV_TARGET_FN_ATTRS | ||
221 | 324 | kleidicv_error_t transpose(const void *src, size_t src_stride, void *dst, | |
222 | size_t dst_stride, size_t src_width, | ||
223 | size_t src_height, size_t element_size) { | ||
224 |
5/5✓ Branch 0 taken 84 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 69 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.
|
324 | switch (element_size) { |
225 | case sizeof(uint8_t): | ||
226 | 138 | return transpose<uint8_t>(src, src_stride, dst, dst_stride, src_width, | |
227 | 69 | src_height); | |
228 | case sizeof(uint16_t): | ||
229 | 168 | return transpose<uint16_t>(src, src_stride, dst, dst_stride, src_width, | |
230 | 84 | src_height); | |
231 | case sizeof(uint32_t): | ||
232 | 168 | return transpose<uint32_t>(src, src_stride, dst, dst_stride, src_width, | |
233 | 84 | src_height); | |
234 | case sizeof(uint64_t): | ||
235 | 168 | return transpose<uint64_t>(src, src_stride, dst, dst_stride, src_width, | |
236 | 84 | src_height); | |
237 | default: | ||
238 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
239 | } | ||
240 | 324 | } | |
241 | |||
242 | } // namespace kleidicv::neon | ||
243 |